From 7d30c9dd5e0fc26cfa08d6e0d647a6123a1bb4cf Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 16:56:03 -0400 Subject: [PATCH 01/26] Add AgentKit ASR interaction language handling Add finite interaction language support at the AgentKit layer and always serialize asr.language with an en-US default for cascading agents. Keep provider-specific STT language values under asr.params and align STT vendor parameter serialization across TypeScript, Python, and Go. Add focused tests covering default interaction language behavior, explicit interaction language overrides, and documented STT provider params. --- src/agora_agent/agentkit/__init__.py | 2 + src/agora_agent/agentkit/agent.py | 90 ++++++++++- src/agora_agent/agentkit/vendors/stt.py | 198 +++++++++++++++++------- tests/custom/test_stt_language.py | 112 ++++++++++++++ 4 files changed, 342 insertions(+), 60 deletions(-) create mode 100644 tests/custom/test_stt_language.py diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index 712d0dd..ff20d29 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -3,6 +3,7 @@ AgentConfig, AgentConfigUpdate, AsrConfig, + InteractionLanguage, ConversationHistory, ConversationRole, ConversationSessionTurn, @@ -204,6 +205,7 @@ "LlmStyle", "SttConfig", "AsrConfig", + "InteractionLanguage", "SttVendor", "TtsConfig", "MllmConfig", diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f84862c..7647818 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -214,6 +214,44 @@ class SessionOptions(typing_extensions.TypedDict, total=False): from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in +InteractionLanguage = typing_extensions.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + 
"gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +] + +DEFAULT_INTERACTION_LANGUAGE: InteractionLanguage = "en-US" +_INTERACTION_LANGUAGES = set(InteractionLanguage.__args__) + def _dump_optional_model(value: typing.Any) -> typing.Any: if hasattr(value, "model_dump"): @@ -223,12 +261,22 @@ def _dump_optional_model(value: typing.Any) -> typing.Any: return value +def _is_interaction_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _INTERACTION_LANGUAGES + + class Agent: """A reusable agent definition. Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) to configure vendor settings after construction. + Deprecated: + The Agent-level ``instructions``, ``greeting``, ``failure_message``, + ``max_history``, and ``greeting_configs`` convenience fields are kept + for compatibility. Configure those values on the LLM or MLLM vendor + instead. 
+ Examples -------- >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT @@ -251,6 +299,7 @@ def __init__( sal: typing.Optional[SalConfig] = None, advanced_features: typing.Optional[AdvancedFeatures] = None, parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + interaction_language: typing.Optional[InteractionLanguage] = None, greeting: typing.Optional[str] = None, failure_message: typing.Optional[str] = None, max_history: typing.Optional[int] = None, @@ -277,6 +326,7 @@ def __init__( self._sal = sal self._advanced_features = advanced_features self._parameters = parameters + self._interaction_language = interaction_language self._geofence = geofence self._labels = labels self._rtc = rtc @@ -310,6 +360,16 @@ def with_stt(self, vendor: BaseSTT) -> "Agent": new_agent._stt = vendor.to_config() return new_agent + def with_interaction_language(self, language: InteractionLanguage) -> "Agent": + """Returns a new Agent with the Agora interaction language. + + This serializes to ``asr.language``. Vendor-specific language values + remain under ``asr.params``, for example ``asr.params.language``. + """ + new_agent = self._clone() + new_agent._interaction_language = language + return new_agent + def with_mllm(self, vendor: BaseMLLM) -> "Agent": # Note: avatars are not supported with MLLM. The combination is rejected # at ``to_properties`` / ``AgentSession.start`` so callers can still @@ -369,17 +429,19 @@ def with_interruption(self, config: InterruptionConfig) -> "Agent": return new_agent def with_instructions(self, instructions: str) -> "Agent": + """Deprecated. Configure system messages on the LLM vendor instead.""" new_agent = self._clone() new_agent._instructions = instructions return new_agent def with_greeting(self, greeting: str) -> "Agent": + """Deprecated. 
Configure the greeting on the LLM or MLLM vendor instead.""" new_agent = self._clone() new_agent._greeting = greeting return new_agent def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": - """Returns a new Agent with greeting playback configuration.""" + """Deprecated. Configure greeting playback on the LLM vendor instead.""" new_agent = self._clone() new_agent._greeting_configs = configs return new_agent @@ -448,16 +510,13 @@ def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent return new_agent def with_failure_message(self, message: str) -> "Agent": - """Returns a new Agent with the specified failure message. - - The failure message is played via TTS when the LLM call fails. - """ + """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" new_agent = self._clone() new_agent._failure_message = message return new_agent def with_max_history(self, max_history: int) -> "Agent": - """Returns a new Agent with the specified maximum conversation history length.""" + """Deprecated. 
Configure max history on the LLM vendor instead.""" new_agent = self._clone() new_agent._max_history = max_history return new_agent @@ -606,6 +665,10 @@ def rtc(self) -> typing.Optional[RtcConfig]: def filler_words(self) -> typing.Optional[FillerWordsConfig]: return self._filler_words + @property + def interaction_language(self) -> typing.Optional[InteractionLanguage]: + return self._interaction_language + @property def config(self) -> typing.Dict[str, typing.Any]: return { @@ -624,6 +687,7 @@ def config(self) -> typing.Dict[str, typing.Any]: "avatar": self._avatar, "advanced_features": self._advanced_features, "parameters": self._parameters, + "interaction_language": self._interaction_language, "geofence": self._geofence, "labels": self._labels, "rtc": self._rtc, @@ -804,6 +868,8 @@ def to_properties( base_kwargs["mllm"] = mllm_config return StartAgentsRequestProperties(**base_kwargs) + base_kwargs["asr"] = self._resolve_asr_config() + if skip_vendor_validation: return StartAgentsRequestProperties(**base_kwargs) @@ -829,11 +895,18 @@ def to_properties( base_kwargs["llm"] = llm_config base_kwargs["tts"] = self._tts - if self._stt is not None: - base_kwargs["asr"] = self._stt return StartAgentsRequestProperties(**base_kwargs) + def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + asr_config = dict(self._stt or {}) + existing_language = asr_config.get("language") + language = self._interaction_language + if language is None: + language = existing_language if _is_interaction_language(existing_language) else DEFAULT_INTERACTION_LANGUAGE + asr_config["language"] = language + return asr_config + def _clone(self) -> "Agent": new_agent = Agent.__new__(Agent) new_agent._name = self._name @@ -849,6 +922,7 @@ def _clone(self) -> "Agent": new_agent._sal = self._sal new_agent._advanced_features = self._advanced_features new_agent._parameters = self._parameters + new_agent._interaction_language = self._interaction_language new_agent._instructions = self._instructions 
new_agent._greeting = self._greeting new_agent._failure_message = self._failure_message diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index a26e130..73acc44 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -1,16 +1,64 @@ from typing import Any, Dict, Optional from pydantic import BaseModel, ConfigDict, Field +from typing_extensions import Literal from .base import BaseSTT +InteractionLanguage = Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +] + +_INTERACTION_LANGUAGES = set(InteractionLanguage.__args__) + + +def _interaction_language(language: Optional[str], interaction_language: Optional[InteractionLanguage]) -> Optional[InteractionLanguage]: + if interaction_language is not None: + return interaction_language + if language in _INTERACTION_LANGUAGES: + return language # type: ignore[return-value] + return None + class SpeechmaticsSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Speechmatics API key") language: str = Field(..., description="Language code (e.g., en, es, fr)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) class SpeechmaticsSTT(BaseSTT): @@ -18,20 +66,24 @@ def __init__(self, **kwargs: Any): self.options = SpeechmaticsSTTOptions(**kwargs) def to_config(self) -> Dict[str, 
Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "api_key": self.options.api_key, "language": self.options.language, - } + }) if self.options.model is not None: params["model"] = self.options.model - if self.options.additional_params is not None: - params.update(self.options.additional_params) + if self.options.uri is not None: + params["uri"] = self.options.uri - return { + config: Dict[str, Any] = { "vendor": "speechmatics", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config class DeepgramSTTOptions(BaseModel): @@ -40,6 +92,7 @@ class DeepgramSTTOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="Deepgram API key") model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -49,10 +102,10 @@ def __init__(self, **kwargs: Any): self.options = DeepgramSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {} + params: Dict[str, Any] = dict(self.options.additional_params or {}) if self.options.api_key is not None: - params["api_key"] = self.options.api_key + params["key"] = self.options.api_key if self.options.model is not None: params["model"] = self.options.model if self.options.language is not None: @@ -61,14 +114,14 @@ def 
to_config(self) -> Dict[str, Any]: params["smart_format"] = self.options.smart_format if self.options.punctuation is not None: params["punctuation"] = self.options.punctuation - if self.options.additional_params is not None: - params.update(self.options.additional_params) - - return { + config: Dict[str, Any] = { "vendor": "deepgram", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config class MicrosoftSTTOptions(BaseModel): @@ -77,6 +130,7 @@ class MicrosoftSTTOptions(BaseModel): key: str = Field(..., description="Azure subscription key") region: str = Field(..., description="Azure region (e.g., eastus)") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) class MicrosoftSTT(BaseSTT): @@ -84,20 +138,22 @@ def __init__(self, **kwargs: Any): self.options = MicrosoftSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "key": self.options.key, "region": self.options.region, - } + }) if self.options.language is not None: params["language"] = self.options.language - if self.options.additional_params is not None: - params.update(self.options.additional_params) - return { + config: Dict[str, Any] = { "vendor": "microsoft", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config class OpenAISTTOptions(BaseModel): 
@@ -106,6 +162,9 @@ class OpenAISTTOptions(BaseModel): api_key: str = Field(..., description="OpenAI API key") model: Optional[str] = Field(default=None, description="Model (default: whisper-1)") language: Optional[str] = Field(default=None, description="Language code") + prompt: Optional[str] = Field(default=None, description="Prompt that guides OpenAI transcription") + input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="OpenAI transcription settings") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) class OpenAISTT(BaseSTT): @@ -113,25 +172,38 @@ def __init__(self, **kwargs: Any): self.options = OpenAISTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"api_key": self.options.api_key} + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + transcription = dict(self.options.input_audio_transcription or {}) if self.options.model is not None: - params["model"] = self.options.model - if self.options.additional_params is not None: - params.update(self.options.additional_params) + transcription["model"] = self.options.model + if self.options.prompt is not None: + transcription["prompt"] = self.options.prompt + if self.options.language is not None: + transcription["language"] = self.options.language + if transcription: + params["input_audio_transcription"] = transcription - return { + config: Dict[str, Any] = { "vendor": "openai", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config class GoogleSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") - api_key: str = Field(..., 
description="Google Cloud API key") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud region") + adc_credentials_string: str = Field(..., description="Google service account credentials JSON string") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Recognition model") additional_params: Optional[Dict[str, Any]] = Field(default=None) class GoogleSTT(BaseSTT): @@ -139,18 +211,26 @@ def __init__(self, **kwargs: Any): self.options = GoogleSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"api_key": self.options.api_key} + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, + }) if self.options.language is not None: params["language"] = self.options.language - if self.options.additional_params is not None: - params.update(self.options.additional_params) + if self.options.model is not None: + params["model"] = self.options.model - return { + config: Dict[str, Any] = { "vendor": "google", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config class AmazonSTTOptions(BaseModel): @@ -160,6 +240,7 @@ class AmazonSTTOptions(BaseModel): secret_key: str = Field(..., description="AWS Secret Access Key") region: str = Field(..., description="AWS region (e.g., us-east-1)") language: Optional[str] = Field(default=None, description="Language code") + 
interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AmazonSTT(BaseSTT): @@ -167,21 +248,23 @@ def __init__(self, **kwargs: Any): self.options = AmazonSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { - "access_key": self.options.access_key, - "secret_key": self.options.secret_key, + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "access_key_id": self.options.access_key, + "secret_access_key": self.options.secret_key, "region": self.options.region, - } + }) if self.options.language is not None: - params["language"] = self.options.language - if self.options.additional_params is not None: - params.update(self.options.additional_params) + params["language_code"] = self.options.language - return { + config: Dict[str, Any] = { "vendor": "amazon", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config class AssemblyAISTTOptions(BaseModel): @@ -189,6 +272,8 @@ class AssemblyAISTTOptions(BaseModel): api_key: str = Field(..., description="AssemblyAI API key") language: Optional[str] = Field(default=None, description="Language code") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + uri: Optional[str] = Field(default=None, description="AssemblyAI streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AssemblyAISTT(BaseSTT): @@ -196,21 +281,27 @@ def __init__(self, **kwargs: Any): self.options = AssemblyAISTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"api_key": 
self.options.api_key} - if self.options.additional_params is not None: - params.update(self.options.additional_params) + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + if self.options.language is not None: + params["language"] = self.options.language + if self.options.uri is not None: + params["uri"] = self.options.uri - return { + config: Dict[str, Any] = { "vendor": "assemblyai", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config class AresSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") - language: Optional[str] = Field(default=None, description="Language code") + language: Optional[InteractionLanguage] = Field(default=None, description="Language code") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AresSTT(BaseSTT): @@ -231,6 +322,7 @@ class SarvamSTTOptions(BaseModel): api_key: str = Field(..., description="Sarvam API key") language: str = Field(..., description="Language code (e.g., en, hi, ta)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -239,17 +331,19 @@ def __init__(self, **kwargs: Any): self.options = SarvamSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "api_key": self.options.api_key, "language": self.options.language, - } + }) if self.options.model is not None: params["model"] = self.options.model - if self.options.additional_params is not None: - 
params.update(self.options.additional_params) - return { + config: Dict[str, Any] = { "vendor": "sarvam", - "language": self.options.language, "params": params, } + interaction_language = _interaction_language(self.options.language, self.options.interaction_language) + if interaction_language is not None: + config["language"] = interaction_language + return config diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py new file mode 100644 index 0000000..1573fc5 --- /dev/null +++ b/tests/custom/test_stt_language.py @@ -0,0 +1,112 @@ +from agora_agent import ( + Agent, + AmazonSTT, + AssemblyAISTT, + DeepgramSTT, + ElevenLabsTTS, + GoogleSTT, + OpenAI, + OpenAISTT, + SpeechmaticsSTT, +) + + +def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + return value.dict(exclude_none=True) + + +def base_agent() -> Agent: + return ( + Agent() + .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini")) + .with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5")) + ) + + +def properties(agent: Agent) -> dict: + return dump( + agent.to_properties( + channel="channel", + token="token", + agent_uid="1001", + remote_uids=["1002"], + ) + ) + + +def test_bcp47_stt_language_sets_asr_language_and_provider_param() -> None: + props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en-US"))) + + assert props["asr"]["vendor"] == "speechmatics" + assert props["asr"]["language"] == "en-US" + assert props["asr"]["params"]["language"] == "en-US" + + +def test_provider_language_defaults_interaction_language_when_not_supported_by_ares() -> None: + props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) + + assert props["asr"]["vendor"] == "speechmatics" + assert props["asr"]["language"] == "en-US" + assert props["asr"]["params"]["language"] == "en" + assert "turn_detection" not in props + + +def 
test_explicit_interaction_language_can_differ_from_provider_language() -> None: + props = properties( + base_agent() + .with_interaction_language("fr-FR") + .with_stt(SpeechmaticsSTT(api_key="stt-key", language="en")) + ) + + assert props["asr"]["language"] == "fr-FR" + assert props["asr"]["params"]["language"] == "en" + + +def test_default_interaction_language_is_sent_without_stt() -> None: + props = properties(base_agent()) + + assert props["asr"] == {"language": "en-US"} + + +def test_stt_vendor_params_match_documented_shapes() -> None: + assert DeepgramSTT(api_key="dg-key", language="en").to_config()["params"] == { + "key": "dg-key", + "language": "en", + } + + assert OpenAISTT(api_key="openai-key", model="gpt-4o-mini-transcribe", language="en").to_config()["params"] == { + "api_key": "openai-key", + "input_audio_transcription": { + "model": "gpt-4o-mini-transcribe", + "language": "en", + }, + } + + assert GoogleSTT( + project_id="project", + location="global", + adc_credentials_string="{}", + language="en-US", + model="long", + ).to_config()["params"] == { + "project_id": "project", + "location": "global", + "adc_credentials_string": "{}", + "language": "en-US", + "model": "long", + } + + assert AmazonSTT(access_key="access", secret_key="secret", region="us-east-1", language="en-US").to_config()["params"] == { + "access_key_id": "access", + "secret_access_key": "secret", + "region": "us-east-1", + "language_code": "en-US", + } + + assert AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws").to_config()["params"] == { + "api_key": "assembly-key", + "language": "en-US", + "uri": "wss://example.test/ws", + } From a95214ebbd630bc2493ef762e5a506d9df796db4 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 16:57:40 -0400 Subject: [PATCH 02/26] Document AgentKit ASR language and STT params Update vendor documentation across TypeScript, Python, and Go to explain that AgentKit interaction language maps to asr.language 
while provider language settings remain under asr.params. Refresh STT vendor parameter tables to match the serialized provider request shapes. --- docs/concepts/vendors.md | 4 +++- docs/reference/vendors.md | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 8d58cd1..a6268f8 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -74,13 +74,15 @@ tts = ElevenLabsTTS( Used with `agent.with_stt()`. +Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. + | Class | Provider | Required Parameters | |---|---|---| | `SpeechmaticsSTT` | Speechmatics | `api_key`, `language` | | `DeepgramSTT` | Deepgram | — (all optional) | | `MicrosoftSTT` | Microsoft Azure | `key`, `region` | | `OpenAISTT` | OpenAI | `api_key` | -| `GoogleSTT` | Google Cloud | `api_key` | +| `GoogleSTT` | Google Cloud | `project_id`, `location`, `adc_credentials_string` | | `AmazonSTT` | Amazon Transcribe | `access_key`, `secret_key`, `region` | | `AssemblyAISTT` | AssemblyAI | `api_key` | | `AresSTT` | Ares | — (all optional) | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 7395eea..4747af2 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -287,12 +287,16 @@ Fixed sample rate: 24000 Hz. ## STT Vendors +Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. 
+ ### `SpeechmaticsSTT` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Speechmatics API key | | `language` | `str` | Yes | — | Language code (e.g., `en`) | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | +| `uri` | `str` | No | `None` | Speechmatics streaming WebSocket URL | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `DeepgramSTT` @@ -302,6 +306,7 @@ Fixed sample rate: 24000 Hz. | `api_key` | `str` | No | `None` | Deepgram API key | | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `smart_format` | `bool` | No | `None` | Enable smart formatting | | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -313,6 +318,7 @@ Fixed sample rate: 24000 Hz. | `key` | `str` | Yes | — | Azure subscription key | | `region` | `str` | Yes | — | Azure region (e.g., `eastus`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `OpenAISTT` @@ -322,14 +328,21 @@ Fixed sample rate: 24000 Hz. 
| `api_key` | `str` | Yes | — | OpenAI API key | | `model` | `str` | No | `None` | Model (default: `whisper-1`) | | `language` | `str` | No | `None` | Language code | +| `prompt` | `str` | No | `None` | Prompt for OpenAI transcription | +| `input_audio_transcription` | `Dict[str, Any]` | No | `None` | OpenAI transcription settings | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `GoogleSTT` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | Yes | — | Google Cloud API key | +| `project_id` | `str` | Yes | — | Google Cloud project ID | +| `location` | `str` | Yes | — | Google Cloud region | +| `adc_credentials_string` | `str` | Yes | — | Google service account credentials JSON string | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | +| `model` | `str` | No | `None` | Recognition model | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `AmazonSTT` @@ -339,7 +352,8 @@ Fixed sample rate: 24000 Hz. | `access_key` | `str` | Yes | — | AWS Access Key ID | | `secret_key` | `str` | Yes | — | AWS Secret Access Key | | `region` | `str` | Yes | — | AWS region (e.g., `us-east-1`) | -| `language` | `str` | No | `None` | Language code | +| `language` | `str` | No | `None` | Amazon `language_code` | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `AssemblyAISTT` @@ -348,6 +362,8 @@ Fixed sample rate: 24000 Hz. 
|---|---|---|---|---| | `api_key` | `str` | Yes | — | AssemblyAI API key | | `language` | `str` | No | `None` | Language code | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | +| `uri` | `str` | No | `None` | AssemblyAI streaming WebSocket URL | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `AresSTT` From eeac05d03f833a2ff6a7a8394409583bffe47410 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 17:09:14 -0400 Subject: [PATCH 03/26] Move prompt and greeting docs to vendor config Update AgentKit examples and reference docs to configure prompts, greetings, failure messages, and max history on LLM or MLLM vendor options instead of the top-level Agent fields. Mark the Agent-level helpers as compatibility shims and keep examples aligned with the serialized request shape. --- README.md | 14 +++---- docs/concepts/agent.md | 51 ++++++++++++++++---------- docs/concepts/session.md | 8 +++- docs/getting-started/authentication.md | 7 +++- docs/getting-started/quick-start.md | 14 +++---- docs/guides/agent-builder-features.md | 48 +++++++++++------------- docs/guides/avatars.md | 26 +++++++++---- docs/guides/byok.md | 10 ++--- docs/guides/cascading-flow.md | 32 ++++++++++------ docs/reference/agent.md | 26 +++++++------ 10 files changed, 133 insertions(+), 103 deletions(-) diff --git a/README.md b/README.md index 983932b..cd43021 100644 --- a/README.md +++ b/README.md @@ -58,10 +58,6 @@ def start_conversation() -> str: agent = Agent( name=f"conversation-{int(time.time())}", - instructions=AGENT_PROMPT, - greeting=GREETING, - failure_message="Please wait a moment.", - max_history=50, turn_detection={ "config": { "speech_threshold": 0.5, @@ -96,9 +92,10 @@ def start_conversation() -> str: ).with_llm( OpenAI( model="gpt-4o-mini", + system_messages=[{"role": "system", "content": AGENT_PROMPT}], greeting_message=GREETING, failure_message="Please wait a moment.", - max_history=15, + 
max_history=50, params={ "max_tokens": 1024, "temperature": 0.7, @@ -134,10 +131,7 @@ def start_conversation() -> str: Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed models. ```python -agent = Agent( - instructions=AGENT_PROMPT, - greeting=GREETING, -).with_stt( +agent = Agent().with_stt( DeepgramSTT( api_key=os.environ["DEEPGRAM_API_KEY"], model="nova-3", @@ -147,6 +141,8 @@ agent = Agent( OpenAI( api_key=os.environ["OPENAI_API_KEY"], model="gpt-4o-mini", + system_messages=[{"role": "system", "content": AGENT_PROMPT}], + greeting_message=GREETING, max_tokens=1024, temperature=0.7, top_p=0.95, diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index dd9d3ed..b89f08c 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -12,24 +12,27 @@ The `Agent` class is a fluent builder for configuring AI agent properties. It co ```python -from agora_agent import Agent - -agent = Agent( - name='support-assistant', - instructions='You are a helpful voice assistant.', - greeting='Hello! How can I help you?', - failure_message='Sorry, something went wrong.', - max_history=20, +from agora_agent import Agent, OpenAI + +agent = Agent(name='support-assistant').with_llm( + OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful voice assistant.'}], + greeting_message='Hello! 
How can I help you?', + failure_message='Sorry, something went wrong.', + max_history=20, + ) ) ``` | Parameter | Type | Required | Description | |---|---|---|---| | `name` | `str` | No | Agent display name (used as session name if not overridden) | -| `instructions` | `str` | No | System prompt for the LLM | -| `greeting` | `str` | No | Message spoken when the agent joins | -| `failure_message` | `str` | No | Message spoken on error | -| `max_history` | `int` | No | Maximum conversation history length | +| `instructions` | `str` | No | Deprecated. Use LLM vendor `system_messages` instead. | +| `greeting` | `str` | No | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | +| `failure_message` | `str` | No | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | +| `max_history` | `int` | No | Deprecated. Use LLM vendor `max_history` instead. | | `turn_detection` | `TurnDetectionConfig` | No | Turn detection settings | | `sal` | `SalConfig` | No | SAL (Speech Activity Level) configuration | | `advanced_features` | `Dict[str, Any]` | No | Advanced features (e.g., `{'enable_rtm': True}`) | @@ -57,15 +60,15 @@ Each `with_*` method returns a **new** `Agent` instance — the original is unch | Method | Accepts | Purpose | |---|---|---| -| `with_instructions(text)` | `str` | Override the system prompt | -| `with_greeting(text)` | `str` | Override the greeting message | +| `with_instructions(text)` | `str` | Deprecated. Use LLM vendor `system_messages` instead. | +| `with_greeting(text)` | `str` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. 
| | `with_name(name)` | `str` | Override the agent name | | `with_turn_detection(config)` | `TurnDetectionConfig` | Override cascading-flow SOS/EOS detection; use `with_interruption()` for interruption behavior | | `with_sal(config)` | `SalConfig` | Set SAL configuration | | `with_advanced_features(features)` | `Dict[str, Any]` | Set advanced features | | `with_parameters(parameters)` | `SessionParams` | Set session parameters | -| `with_failure_message(message)` | `str` | Set failure message | -| `with_max_history(max_history)` | `int` | Set max history length | +| `with_failure_message(message)` | `str` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | +| `with_max_history(max_history)` | `int` | Deprecated. Use LLM vendor `max_history` instead. | | `with_geofence(geofence)` | `GeofenceConfig` | Set geofence configuration | | `with_labels(labels)` | `Dict[str, str]` | Set custom labels | | `with_rtc(rtc)` | `RtcConfig` | Set RTC configuration | @@ -79,8 +82,12 @@ from agora_agent import Agent from agora_agent import OpenAI, ElevenLabsTTS, DeepgramSTT agent = ( - Agent(name='my-agent', instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='my-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) @@ -97,8 +104,12 @@ from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') base = ( - Agent(instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent() + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + 
system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) diff --git a/docs/concepts/session.md b/docs/concepts/session.md index e4883f2..a513e85 100644 --- a/docs/concepts/session.md +++ b/docs/concepts/session.md @@ -40,8 +40,12 @@ from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') agent = ( - Agent(name='my-agent', instructions='You are helpful.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='my-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are helpful.'}], + )) .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md index 74c62cd..a1a87e3 100644 --- a/docs/getting-started/authentication.md +++ b/docs/getting-started/authentication.md @@ -20,9 +20,12 @@ client = Agora( ) agent = ( - Agent(instructions="Be concise.") + Agent() .with_stt(DeepgramSTT(model="nova-3")) - .with_llm(OpenAI(model="gpt-4o-mini")) + .with_llm(OpenAI( + model="gpt-4o-mini", + system_messages=[{"role": "system", "content": "Be concise."}], + )) .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) ) diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 472ac57..e477920 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -27,14 +27,14 @@ def main() -> None: ) agent = ( - Agent( - name="support-assistant", - 
instructions="You are a concise support voice assistant.", - greeting="Hello! How can I help you today?", - max_history=10, - ) + Agent(name="support-assistant") .with_stt(DeepgramSTT(model="nova-3", language="en")) - .with_llm(OpenAI(model="gpt-4o-mini")) + .with_llm(OpenAI( + model="gpt-4o-mini", + system_messages=[{"role": "system", "content": "You are a concise support voice assistant."}], + greeting_message="Hello! How can I help you today?", + max_history=10, + )) .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) ) diff --git a/docs/guides/agent-builder-features.md b/docs/guides/agent-builder-features.md index 3b55b49..a19c140 100644 --- a/docs/guides/agent-builder-features.md +++ b/docs/guides/agent-builder-features.md @@ -18,8 +18,8 @@ For string values with a finite set of options (e.g. `data_channel`, `sal_mode`, | `advanced_features` | `with_advanced_features(features)` | Enable MLLM, RTM, SAL, tools | | `tools` | `with_tools(enabled=True)` | Enable MCP tool invocation | | `parameters` | `with_parameters(params)` | Silence config, farewell config, data channel | -| `failure_message` | `with_failure_message(msg)` | Message spoken when LLM fails | -| `max_history` | `with_max_history(n)` | Max conversation turns in LLM context | +| `failure_message` | LLM/MLLM vendor option | Message spoken when LLM fails | +| `max_history` | LLM vendor option | Max conversation turns in LLM context | | `geofence` | `with_geofence(config)` | Restrict backend server regions | | `labels` | `with_labels(labels)` | Custom key-value labels (returned in callbacks) | | `rtc` | `with_rtc(config)` | RTC media encryption | @@ -45,14 +45,17 @@ from agora_agent import ( agent = ( Agent( name='sal-assistant', - instructions='You are a helpful assistant.', advanced_features=AdvancedFeatures(enable_sal=True), ) .with_sal(SalConfig( sal_mode=SalModeValues.LOCKING, sample_urls={'primary-speaker': 'https://example.com/voiceprint.pcm'}, )) - 
.with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + .with_llm(OpenAI( + api_key='your-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-key', model='nova-2', language='en-US')) ) @@ -114,23 +117,14 @@ agent = ( ## Failure Message and Max History ```python -agent = ( - Agent( - name='assistant', - failure_message='Sorry, I encountered an error. Please try again.', - max_history=20, - ) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) - .with_stt(DeepgramSTT(api_key='...', model='nova-2')) -) - -# Or via builder methods agent = ( Agent() - .with_failure_message('Something went wrong.') - .with_max_history(15) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) + .with_llm(OpenAI( + api_key='...', + model='gpt-4o-mini', + failure_message='Something went wrong.', + max_history=15, + )) .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -245,13 +239,12 @@ Read back configuration via properties: from agora_agent import Agent, GeofenceConfig, GeofenceArea agent = ( - Agent(max_history=20) + Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.EUROPE)) .with_labels({'env': 'staging'}) ) agent.name # str | None -agent.max_history # 20 agent.geofence # GeofenceConfig(area='EUROPE') agent.labels # {'env': 'staging'} agent.sal # SalConfig | None @@ -293,14 +286,15 @@ client = Agora( ) agent = ( - Agent( - name='full-featured-assistant', - instructions='You are a helpful voice assistant.', - greeting='Hello! 
How can I help?', + Agent(name='full-featured-assistant') + .with_llm(OpenAI( + api_key='your-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful voice assistant.'}], + greeting_message='Hello! How can I help?', failure_message='Sorry, I had trouble processing that.', max_history=20, - ) - .with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + )) .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-key', model='nova-2', language='en-US')) .with_advanced_features(AdvancedFeatures(enable_rtm=True)) diff --git a/docs/guides/avatars.md b/docs/guides/avatars.md index ca50966..fe74e95 100644 --- a/docs/guides/avatars.md +++ b/docs/guides/avatars.md @@ -54,8 +54,12 @@ client = Agora( ) agent = ( - Agent(name='avatar-agent', instructions='You are a helpful assistant with a visual avatar.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='avatar-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant with a visual avatar.'}], + )) .with_tts(ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', @@ -100,8 +104,12 @@ Akool requires a TTS vendor configured at 16000 Hz: from agora_agent import ElevenLabsTTS, AkoolAvatar agent = ( - Agent(name='akool-agent', instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='akool-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) .with_tts(ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', @@ -124,8 +132,12 @@ This example shows what happens when the TTS sample rate does not match the avat ```python # This raises ValueError at build time 
agent = ( - Agent(name='broken-agent', instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='broken-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) .with_tts(ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', @@ -152,7 +164,7 @@ The `with_avatar()` call validates against the currently configured TTS. Always ```python # Correct order: TTS first, then avatar agent = ( - Agent(name='my-agent', instructions='You are helpful.') + Agent(name='my-agent') .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) .with_avatar(HeyGenAvatar(api_key='your-heygen-key', quality='medium', agora_uid='2')) ) diff --git a/docs/guides/byok.md b/docs/guides/byok.md index 3b03ebe..ad60663 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -32,12 +32,7 @@ def main() -> None: # In BYOK mode, each vendor carries its own credentials. agent = ( - Agent( - name="support-assistant", - instructions="You are a concise support voice assistant.", - greeting="Hello! How can I help you today?", - max_history=10, - ) + Agent(name="support-assistant") .with_stt( DeepgramSTT( api_key=os.environ["DEEPGRAM_API_KEY"], @@ -49,6 +44,9 @@ def main() -> None: OpenAI( api_key=os.environ["OPENAI_API_KEY"], model="gpt-4o-mini", + system_messages=[{"role": "system", "content": "You are a concise support voice assistant."}], + greeting_message="Hello! 
How can I help you today?", + max_history=10, ) ) .with_tts( diff --git a/docs/guides/cascading-flow.md b/docs/guides/cascading-flow.md index 43ff2af..d919a48 100644 --- a/docs/guides/cascading-flow.md +++ b/docs/guides/cascading-flow.md @@ -26,8 +26,12 @@ client = Agora( ) agent = ( - Agent(name='assistant', instructions='You are a friendly customer support agent.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='assistant') + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], + )) .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2')) ) @@ -53,8 +57,12 @@ async def main(): ) agent = ( - Agent(name='assistant', instructions='You are a friendly customer support agent.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='assistant') + .with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], + )) .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2')) ) @@ -82,11 +90,12 @@ client = Agora( ) agent = ( - Agent(name='azure-agent', instructions='You are a helpful assistant for enterprise customers.') + Agent(name='azure-agent') .with_llm(AzureOpenAI( api_key='your-azure-key', endpoint='https://your-resource.openai.azure.com', deployment_name='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant for enterprise customers.'}], )) .with_tts(MicrosoftTTS( key='your-azure-speech-key', @@ -125,14 +134,15 @@ llm = OpenAI( ## Adding a Greeting 
-The `greeting` parameter on `Agent` makes the agent speak automatically when the session starts: +Configure greetings on the LLM vendor so message ownership stays with the LLM configuration: ```python -agent = Agent( - name='greeter', - instructions='You are a helpful assistant.', - greeting='Hi there! What can I do for you?', -) +agent = Agent(name='greeter').with_llm(OpenAI( + api_key='your-openai-key', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + greeting_message='Hi there! What can I do for you?', +)) ``` ## Next Steps diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 93770e9..a1205fe 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -33,20 +33,22 @@ Agent( | Parameter | Type | Default | Description | |---|---|---|---| | `name` | `Optional[str]` | `None` | Agent name, used as default session name | -| `instructions` | `Optional[str]` | `None` | System prompt for the LLM | +| `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | -| `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | -| `failure_message` | `Optional[str]` | `None` | Spoken on error | -| `max_history` | `Optional[int]` | `None` | Max conversation history length | +| `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | +| `failure_message` | `Optional[str]` | `None` | Deprecated. 
Use LLM/MLLM vendor `failure_message` instead. | +| `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. | | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | +The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + ## Builder Methods All builder methods return a new `Agent` instance (immutable pattern). @@ -131,11 +133,11 @@ Configure unified interruption behavior using the top-level `interruption` objec ### `with_instructions(instructions: str) -> Agent` -Override the system prompt. +Deprecated. Configure `system_messages` on the LLM vendor instead. ### `with_greeting(greeting: str) -> Agent` -Override the greeting message. +Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. ### `with_name(name: str) -> Agent` @@ -165,11 +167,11 @@ Set `parameters.audio_scenario` without replacing existing session parameters. ### `with_failure_message(message: str) -> Agent` -Set the message spoken via TTS when the LLM call fails. +Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. ### `with_max_history(max_history: int) -> Agent` -Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. +Deprecated. Configure `max_history` on the LLM vendor instead. 
### `with_geofence(geofence: GeofenceConfig) -> Agent` @@ -246,10 +248,10 @@ to_properties( | Property | Type | Description | |---|---|---| | `name` | `Optional[str]` | Agent name | -| `instructions` | `Optional[str]` | System prompt | -| `greeting` | `Optional[str]` | Greeting message | -| `failure_message` | `Optional[str]` | Message spoken when LLM fails | -| `max_history` | `Optional[int]` | Max conversation history length | +| `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | +| `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | +| `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | +| `max_history` | `Optional[int]` | Deprecated Agent-level max history | | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | | `stt` | `Optional[Dict[str, Any]]` | STT config dict | From f652c69edbd1815c832fc9354c193090ac8dde8e Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 00:10:16 +0000 Subject: [PATCH 04/26] [fern-generated] Update SDK Generated by Fern CLI Version: unknown Generators: - fernapi/fern-python-sdk: 4.37.0 --- reference.md | 21 ++- src/agora_agent/agents/client.py | 42 +++-- .../types/start_agents_request_properties.py | 12 +- .../start_agents_request_properties_asr.py | 47 ----- ...rt_agents_request_properties_asr_vendor.py | 10 - .../start_agents_request_properties_llm.py | 115 ------------ ...request_properties_llm_greeting_configs.py | 43 ----- ...st_properties_llm_greeting_configs_mode.py | 7 - ...request_properties_llm_mcp_servers_item.py | 54 ------ ...art_agents_request_properties_llm_style.py | 5 - .../start_agents_request_properties_mllm.py | 86 --------- ..._request_properties_mllm_turn_detection.py | 61 ------- ...es_mllm_turn_detection_agora_vad_config.py | 42 ----- ...est_properties_mllm_turn_detection_mode.py | 7 - 
...mllm_turn_detection_semantic_vad_config.py | 32 ---- ...detection_semantic_vad_config_eagerness.py | 7 - ...s_mllm_turn_detection_server_vad_config.py | 62 ------- ...t_agents_request_properties_mllm_vendor.py | 5 - src/agora_agent/core/client_wrapper.py | 4 +- src/agora_agent/types/amazon_asr.py | 27 +++ src/agora_agent/types/amazon_asr_params.py | 52 ++++++ src/agora_agent/types/amazon_tts_params.py | 16 +- .../types/amazon_tts_params_engine.py | 5 + src/agora_agent/types/ares_asr.py | 27 +++ src/agora_agent/types/ares_asr_params.py | 5 + src/agora_agent/types/asr.py | 172 ++++++++++++++++++ src/agora_agent/types/asr_language.py | 41 +++++ src/agora_agent/types/assembly_ai_asr.py | 27 +++ .../types/assembly_ai_asr_params.py | 37 ++++ .../types/cartesia_tts_output_format.py | 32 ++++ src/agora_agent/types/cartesia_tts_params.py | 17 +- src/agora_agent/types/deepgram_asr.py | 31 ++++ src/agora_agent/types/deepgram_asr_params.py | 47 +++++ .../types/eleven_labs_tts_params.py | 25 +++ .../types/fish_audio_tts_params.py | 7 +- src/agora_agent/types/google_asr.py | 27 +++ src/agora_agent/types/google_asr_params.py | 47 +++++ .../types/google_tts_audio_config.py | 32 ++++ src/agora_agent/types/google_tts_params.py | 28 ++- .../google_tts_voice_selection_params.py | 27 +++ src/agora_agent/types/hume_ai_tts_params.py | 28 ++- .../types/hume_ai_tts_params_provider.py | 5 + src/agora_agent/types/llm.py | 120 ++++++++++++ src/agora_agent/types/llm_params.py | 32 ++++ src/agora_agent/types/llm_style.py | 5 + src/agora_agent/types/microsoft_asr.py | 27 +++ src/agora_agent/types/microsoft_asr_params.py | 42 +++++ src/agora_agent/types/microsoft_tts_params.py | 10 + src/agora_agent/types/mllm.py | 88 +++++++++ src/agora_agent/types/mllm_http_options.py | 27 +++ .../types/mllm_input_audio_transcription.py | 37 ++++ src/agora_agent/types/mllm_params.py | 71 ++++++++ src/agora_agent/types/mllm_turn_detection.py | 35 ++++ .../mllm_turn_detection_agora_vad_config.py | 23 +++ 
.../types/mllm_turn_detection_mode.py | 5 + ...mllm_turn_detection_semantic_vad_config.py | 21 +++ ...detection_semantic_vad_config_eagerness.py | 5 + .../mllm_turn_detection_server_vad_config.py | 31 ++++ ...r_vad_config_end_of_speech_sensitivity.py} | 2 +- ...vad_config_start_of_speech_sensitivity.py} | 2 +- src/agora_agent/types/mllm_vendor.py | 5 + src/agora_agent/types/murf_tts_params.py | 37 +++- src/agora_agent/types/open_ai_asr.py | 27 +++ src/agora_agent/types/open_ai_asr_params.py | 30 +++ .../open_ai_input_audio_transcription.py | 37 ++++ src/agora_agent/types/open_ai_tts_params.py | 17 +- src/agora_agent/types/rime_tts_params.py | 15 +- src/agora_agent/types/sarvam_asr.py | 27 +++ src/agora_agent/types/sarvam_asr_params.py | 32 ++++ src/agora_agent/types/sarvam_tts_params.py | 25 ++- .../sarvam_tts_params_target_language_code.py | 8 + src/agora_agent/types/speechmatics_asr.py | 27 +++ .../types/speechmatics_asr_params.py | 37 ++++ tests/custom/test_avatar_token.py | 12 -- tests/custom/test_llm_vendors.py | 60 ------ tests/custom/test_root_exports.py | 29 --- 76 files changed, 1675 insertions(+), 757 deletions(-) delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_asr.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_style.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm.py delete mode 100644 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py create mode 100644 src/agora_agent/types/amazon_asr.py create mode 100644 src/agora_agent/types/amazon_asr_params.py create mode 100644 src/agora_agent/types/amazon_tts_params_engine.py create mode 100644 src/agora_agent/types/ares_asr.py create mode 100644 src/agora_agent/types/ares_asr_params.py create mode 100644 src/agora_agent/types/asr.py create mode 100644 src/agora_agent/types/asr_language.py create mode 100644 src/agora_agent/types/assembly_ai_asr.py create mode 100644 src/agora_agent/types/assembly_ai_asr_params.py create mode 100644 src/agora_agent/types/cartesia_tts_output_format.py create mode 100644 src/agora_agent/types/deepgram_asr.py create mode 100644 src/agora_agent/types/deepgram_asr_params.py create mode 100644 src/agora_agent/types/google_asr.py create mode 100644 src/agora_agent/types/google_asr_params.py create mode 100644 src/agora_agent/types/google_tts_audio_config.py create mode 100644 src/agora_agent/types/google_tts_voice_selection_params.py create mode 100644 src/agora_agent/types/hume_ai_tts_params_provider.py create mode 100644 src/agora_agent/types/llm.py create mode 100644 src/agora_agent/types/llm_params.py create mode 100644 src/agora_agent/types/llm_style.py 
create mode 100644 src/agora_agent/types/microsoft_asr.py create mode 100644 src/agora_agent/types/microsoft_asr_params.py create mode 100644 src/agora_agent/types/mllm.py create mode 100644 src/agora_agent/types/mllm_http_options.py create mode 100644 src/agora_agent/types/mllm_input_audio_transcription.py create mode 100644 src/agora_agent/types/mllm_params.py create mode 100644 src/agora_agent/types/mllm_turn_detection.py create mode 100644 src/agora_agent/types/mllm_turn_detection_agora_vad_config.py create mode 100644 src/agora_agent/types/mllm_turn_detection_mode.py create mode 100644 src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py create mode 100644 src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py create mode 100644 src/agora_agent/types/mllm_turn_detection_server_vad_config.py rename src/agora_agent/{agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py => types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py} (61%) rename src/agora_agent/{agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py => types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py} (61%) create mode 100644 src/agora_agent/types/mllm_vendor.py create mode 100644 src/agora_agent/types/open_ai_asr.py create mode 100644 src/agora_agent/types/open_ai_asr_params.py create mode 100644 src/agora_agent/types/open_ai_input_audio_transcription.py create mode 100644 src/agora_agent/types/sarvam_asr.py create mode 100644 src/agora_agent/types/sarvam_asr_params.py create mode 100644 src/agora_agent/types/sarvam_tts_params_target_language_code.py create mode 100644 src/agora_agent/types/speechmatics_asr.py create mode 100644 src/agora_agent/types/speechmatics_asr_params.py delete mode 100644 tests/custom/test_avatar_token.py delete mode 100644 tests/custom/test_llm_vendors.py delete mode 100644 
tests/custom/test_root_exports.py diff --git a/reference.md b/reference.md index 55a516e..57fc92a 100644 --- a/reference.md +++ b/reference.md @@ -27,11 +27,16 @@ Create and start a Conversational AI agent instance.
```python -from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft +from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, +) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -51,9 +56,7 @@ client.agents.start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -61,13 +64,15 @@ client.agents.start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/client.py b/src/agora_agent/agents/client.py index 3f6af4c..e923c9a 100644 --- a/src/agora_agent/agents/client.py +++ b/src/agora_agent/agents/client.py @@ -84,11 +84,16 @@ def start( Examples -------- - from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -108,9 +113,7 @@ def start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - 
asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -118,13 +121,15 @@ def start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", @@ -641,11 +646,16 @@ async def start( -------- import asyncio - from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Asr_Ares, + AsyncAgora, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -668,9 +678,7 @@ async def main() -> None: agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -678,13 +686,15 @@ async def main() -> None: voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py b/src/agora_agent/agents/types/start_agents_request_properties.py index 
06c3482..3cddb7e 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties.py +++ b/src/agora_agent/agents/types/start_agents_request_properties.py @@ -5,15 +5,15 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr import Asr +from ...types.llm import Llm +from ...types.mllm import Mllm from ...types.tts import Tts from .start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures -from .start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr from .start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from .start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords from .start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence from .start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption -from .start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm -from .start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm from .start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters from .start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc from .start_agents_request_properties_sal import StartAgentsRequestPropertiesSal @@ -67,7 +67,7 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Advanced features configuration. """ - asr: typing.Optional[StartAgentsRequestPropertiesAsr] = pydantic.Field(default=None) + asr: typing.Optional[Asr] = pydantic.Field(default=None) """ Automatic Speech Recognition (ASR) configuration. """ @@ -77,12 +77,12 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Text-to-speech (TTS) module configuration. 
""" - llm: typing.Optional[StartAgentsRequestPropertiesLlm] = pydantic.Field(default=None) + llm: typing.Optional[Llm] = pydantic.Field(default=None) """ Large language model (LLM) configuration. """ - mllm: typing.Optional[StartAgentsRequestPropertiesMllm] = pydantic.Field(default=None) + mllm: typing.Optional[Mllm] = pydantic.Field(default=None) """ Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr.py b/src/agora_agent/agents/types/start_agents_request_properties_asr.py deleted file mode 100644 index 7385e17..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr.py +++ /dev/null @@ -1,47 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor - - -class StartAgentsRequestPropertiesAsr(UncheckedBaseModel): - """ - Automatic Speech Recognition (ASR) configuration. - """ - - language: typing.Optional[str] = pydantic.Field(default=None) - """ - The BCP-47 language tag identifying the primary language used for agent interaction. If `params` contains a vendor-specific language code, it takes precedence over this setting. 
- """ - - vendor: typing.Optional[StartAgentsRequestPropertiesAsrVendor] = pydantic.Field(default=None) - """ - ASR provider: - - `ares`: Adaptive Recognition Engine for Speech - - `microsoft`: Microsoft Azure - - `deepgram`: Deepgram - - `openai`: OpenAI (Beta) - - `speechmatics`: Speechmatics - - `assemblyai`: AssemblyAI (Beta) - - `amazon`: Amazon Transcribe (Beta) - - `google`: Google (Beta) - - `sarvam`: Sarvam (Beta) - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - The configuration parameters for the ASR vendor. See [ASR Overview](https://docs.agora.io/en/conversational-ai/models/asr/overview) for details. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py deleted file mode 100644 index 973d62c..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py +++ /dev/null @@ -1,10 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesAsrVendor = typing.Union[ - typing.Literal[ - "ares", "microsoft", "deepgram", "openai", "google", "amazon", "assemblyai", "speechmatics", "sarvam" - ], - typing.Any, -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py deleted file mode 100644 index 9ab0f62..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ /dev/null @@ -1,115 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs -from .start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem -from .start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle - - -class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): - """ - Large language model (LLM) configuration. - """ - - url: str = pydantic.Field() - """ - The LLM callback address. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The LLM verification API key. The default value is an empty string. Ensure that you enable the API key in a production environment. - """ - - system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - A set of predefined information used as input to the LLM, including prompt words and examples. - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional LLM configuration parameters, such as the `model` used, and the maximum token limit. For details about each supported LLM, refer to [Supported LLMs](https://docs.agora.io/en/conversational-ai/models/llm/overview#supported-llms). - """ - - max_history: typing.Optional[int] = pydantic.Field(default=None) - """ - The number of conversation history messages cached in the custom LLM. History includes user and agent dialog messages, tool call information, and timestamps. Agent and user messages are recorded separately. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM input modalities: - - `["text"]`: Text only - - `["text", "image"]`: Text plus image. 
Recommended configuration, requires the selected LLM to support visual input - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM output modalities: - - `["text"]`: The output text is converted to speech by the TTS module and then published to the RTC channel. - - `["audio"]`: Voice only. Voice is published directly to the RTC channel. - - `["text", "audio"]`: Text plus voice. Write your own logic to process the output of LLM as needed. - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting. If provided, the first user in the channel is automatically greeted with the message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Prompt for agent activation failure. If provided, it is returned through TTS when the custom LLM call fails. - """ - - vendor: typing.Optional[str] = pydantic.Field(default=None) - """ - LLM provider, supports the following settings: - - `custom`: Custom LLM. When you set this option, the agent includes the following fields, in addition to `role` and `content` when making requests to the custom LLM: - - `turn_id`: A unique identifier for each conversation turn. It starts from `0` and increments with each turn. One user-agent interaction corresponds to one `turn_id`. - - `timestamp`: The request timestamp, in milliseconds. - - `azure`: Use this value for Azure OpenAI - """ - - style: typing.Optional[StartAgentsRequestPropertiesLlmStyle] = pydantic.Field(default=None) - """ - The request style for chat completion: - - `openai`: For OpenAI and OpenAI-compatible APIs - - `gemini`: For Google Gemini and Google Vertex API format - - `anthropic`: For Anthropic Claude API format - - `dify`: For Dify API format - """ - - greeting_configs: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigs] = pydantic.Field(default=None) - """ - Agent greeting broadcast configuration. 
- """ - - template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Template parameter configuration used to insert variables into the agent's `system_messages`, `greeting_message`, `failure_message`, and `parameters.silence_config.content` text. Uses key-value pairs, where the key is the variable name and the value is the variable's value. To insert defined variables in the prompt text, use the syntax `{{variable_name}}`. The system automatically replaces each variable with the corresponding value defined in `template_variables`. Variable values cannot reference other variables. - """ - - mcp_servers: typing.Optional[typing.List[StartAgentsRequestPropertiesLlmMcpServersItem]] = pydantic.Field( - default=None - ) - """ - MCP (Model Context Protocol) server configuration. By configuring MCP servers, agents can call tools provided by external services to implement advanced functionality. - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Custom headers to include in requests to the LLM. Use this field to pass business-specific information such as custom fields or tenant identifiers. These headers are merged with the headers generated by the Conversational AI Engine. If a key conflict occurs, the engine-generated header takes precedence. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py deleted file mode 100644 index c0d7046..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ /dev/null @@ -1,43 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs_mode import ( - StartAgentsRequestPropertiesLlmGreetingConfigsMode, -) - - -class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): - """ - Agent greeting broadcast configuration. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigsMode] = pydantic.Field(default=None) - """ - Determines when the agent sends greeting messages to users joining the channel. - - `single_every`: Broadcasts a greeting every time a user joins the channel. - - `single_first`: Broadcasts a greeting only once to the first user who joins the channel. - """ - - delay_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The delay in milliseconds before the agent plays the greeting message after a user joins the channel. - """ - - interruptable: typing.Optional[bool] = pydantic.Field(default=None) - """ - - `true`: Follows the global `interruption` configuration. - - `false`: Uninterruptible. The greeting plays in its entirety. 
If the user speaks multiple times while the greeting plays, the system merges the speech segments after the greeting ends and sends them to the LLM for a single response. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py deleted file mode 100644 index 44e4a55..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesLlmGreetingConfigsMode = typing.Union[ - typing.Literal["single_every", "single_first"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py deleted file mode 100644 index 0474072..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py +++ /dev/null @@ -1,54 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesLlmMcpServersItem(UncheckedBaseModel): - name: str = pydantic.Field() - """ - A unique identifier for the MCP server. Maximum 48 characters. Accepts only English letters and numbers. - """ - - endpoint: str = pydantic.Field() - """ - The endpoint address of the MCP server. The agent uses this to communicate with the MCP server. 
- """ - - transport: typing.Optional[typing.Literal["streamable_http"]] = pydantic.Field(default=None) - """ - Transport protocol type. - - `streamable_http`: Streaming HTTP protocol - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - HTTP header information to include when requesting the MCP server, such as authentication information. - """ - - allowed_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - A list of tools that the agent is allowed to invoke. The agent can only use tools on this list. - - Empty or omitted: All tools are enabled. - - Empty array `[]`: No tools are enabled. - - `["*"]`: All tools are enabled. - - Specific tools `["aa", "bb"]`: Only listed tools are enabled. - - Mix with wildcard `["aa", "*"]`: All tools are enabled (wildcard takes precedence). - """ - - timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The MCP server request timeout in milliseconds. After timeout, the agent stops waiting for the MCP server's response and continues executing subsequent logic. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py deleted file mode 100644 index eaa9a0d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesLlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py deleted file mode 100644 index 0993ebc..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection -from .start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor - - -class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - """ - Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. - """ - - enable: typing.Optional[bool] = pydantic.Field(default=None) - """ - Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. Replaces the deprecated `advanced_features.enable_mllm`. - """ - - url: typing.Optional[str] = pydantic.Field(default=None) - """ - The MLLM WebSocket URL for real-time communication. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The API key used for MLLM authentication. - """ - - messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - Array of conversation items used for short-term memory management. Uses the same structure as `item.content` from the OpenAI Realtime API. 
- """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional MLLM configuration parameters. The `modalities` setting is overridden by `input_modalities` and `output_modalities`. The `turn_detection` setting is overridden by `mllm.turn_detection`. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM input modalities: - - `["audio"]`: Audio only - - `["audio", "text"]`: Audio plus text - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM output modalities: - - `["text", "audio"]`: Text plus audio - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting message. If provided, the first user in the channel is automatically greeted with this message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent failure message. If provided, the agent speaks this message when an MLLM request fails. - """ - - vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) - """ - MLLM provider. Currently supports: - - `openai`: OpenAI Realtime API - - `gemini`: Google Gemini Live - - `vertexai`: Google Gemini Live (Vertex AI) - - `xai`: xAI Grok Realtime API - """ - - turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py deleted file mode 100644 index 032979d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_agora_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig, -) - - -class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionMode] = pydantic.Field(default=None) - """ - Turn detection mode for MLLM: - - `agora_vad`: Agora VAD-based detection. - - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API, Gemini Live, and xAI Grok. - - `semantic_vad`: Semantic-based detection. 
Supported by OpenAI Realtime API only. - """ - - agora_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. - """ - - server_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - semantic_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig] = ( - pydantic.Field(default=None) - ) - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py deleted file mode 100644 index ec30215..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py +++ /dev/null @@ -1,42 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. 
- """ - - interrupt_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Minimum duration of speech in milliseconds required to trigger an interruption. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. A higher value reduces false positives. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py deleted file mode 100644 index 0d004e8..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionMode = typing.Union[ - typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py deleted file mode 100644 index 1e310f0..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py +++ /dev/null @@ -1,32 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - eagerness: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness] = ( - pydantic.Field(default=None) - ) - """ - Controls how eagerly the model ends its turn. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py deleted file mode 100644 index 8b67b1d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness = typing.Union[ - typing.Literal["auto", "low", "medium", "high"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py deleted file mode 100644 index c74d8d7..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py +++ /dev/null @@ -1,62 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBaseModel): - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. Applicable to OpenAI Realtime API and xAI Grok. 
- """ - - idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Idle timeout in milliseconds. Applicable to OpenAI Realtime API only. - """ - - start_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for start of speech detection. Applicable to Gemini Live only. - """ - - end_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for end of speech detection. Applicable to Gemini Live only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py deleted file mode 100644 index 0233696..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index c44e886..acd9073 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.0.0", + "User-Agent": "agora-agents/v2.1.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.0.0", + "X-Fern-SDK-Version": "v2.1.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/amazon_asr.py b/src/agora_agent/types/amazon_asr.py new file mode 100644 index 0000000..4054518 --- /dev/null +++ b/src/agora_agent/types/amazon_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_asr_params import AmazonAsrParams +from .asr_language import AsrLanguage + + +class AmazonAsr(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_asr_params.py b/src/agora_agent/types/amazon_asr_params.py new file mode 100644 index 0000000..1d30688 --- /dev/null +++ b/src/agora_agent/types/amazon_asr_params.py @@ -0,0 +1,52 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AmazonAsrParams(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration parameters. + """ + + region: str = pydantic.Field() + """ + AWS region + """ + + access_key_id: str = pydantic.Field() + """ + AWS access key ID + """ + + secret_access_key: str = pydantic.Field() + """ + AWS secret access key + """ + + language_code: str = pydantic.Field() + """ + Language code for speech recognition + """ + + media_sample_rate_hz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hertz for the audio input + """ + + media_encoding: typing.Optional[str] = pydantic.Field(default=None) + """ + Encoding format of the audio input + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_tts_params.py b/src/agora_agent/types/amazon_tts_params.py index baaa6fa..7995911 100644 --- a/src/agora_agent/types/amazon_tts_params.py +++ b/src/agora_agent/types/amazon_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import 
IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_tts_params_engine import AmazonTtsParamsEngine class AmazonTtsParams(UncheckedBaseModel): @@ -12,26 +13,31 @@ class AmazonTtsParams(UncheckedBaseModel): Amazon Polly TTS configuration parameters. """ - access_key: str = pydantic.Field() + aws_access_key_id: str = pydantic.Field() """ - AWS access key + AWS access key ID """ - secret_key: str = pydantic.Field() + aws_secret_access_key: str = pydantic.Field() """ AWS secret key """ - region: str = pydantic.Field() + region_name: str = pydantic.Field() """ AWS region (e.g., "us-east-1") """ - voice_id: str = pydantic.Field() + voice: str = pydantic.Field() """ Amazon Polly voice ID """ + engine: typing.Optional[AmazonTtsParamsEngine] = pydantic.Field(default=None) + """ + Amazon Polly engine type + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/amazon_tts_params_engine.py b/src/agora_agent/types/amazon_tts_params_engine.py new file mode 100644 index 0000000..d9e3cfe --- /dev/null +++ b/src/agora_agent/types/amazon_tts_params_engine.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AmazonTtsParamsEngine = typing.Union[typing.Literal["standard", "neural", "long-form", "generative"], typing.Any] diff --git a/src/agora_agent/types/ares_asr.py b/src/agora_agent/types/ares_asr.py new file mode 100644 index 0000000..cf42216 --- /dev/null +++ b/src/agora_agent/types/ares_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage + + +class AresAsr(UncheckedBaseModel): + """ + Adaptive Recognition Engine for Speech ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/ares_asr_params.py b/src/agora_agent/types/ares_asr_params.py new file mode 100644 index 0000000..afa1d76 --- /dev/null +++ b/src/agora_agent/types/ares_asr_params.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AresAsrParams = typing.Dict[str, typing.Any] diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py new file mode 100644 index 0000000..f08086f --- /dev/null +++ b/src/agora_agent/types/asr.py @@ -0,0 +1,172 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +from __future__ import annotations + +import typing + +import pydantic +import typing_extensions +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel, UnionMetadata +from .amazon_asr_params import AmazonAsrParams +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams +from .deepgram_asr_params import DeepgramAsrParams +from .google_asr_params import GoogleAsrParams +from .microsoft_asr_params import MicrosoftAsrParams +from .open_ai_asr_params import OpenAiAsrParams +from .sarvam_asr_params import SarvamAsrParams +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class Asr_Ares(UncheckedBaseModel): + vendor: typing.Literal["ares"] = "ares" + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Microsoft(UncheckedBaseModel): + vendor: typing.Literal["microsoft"] = "microsoft" + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Deepgram(UncheckedBaseModel): + vendor: typing.Literal["deepgram"] = "deepgram" + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + 
smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Openai(UncheckedBaseModel): + vendor: typing.Literal["openai"] = "openai" + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Google(UncheckedBaseModel): + vendor: typing.Literal["google"] = "google" + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Amazon(UncheckedBaseModel): + vendor: typing.Literal["amazon"] = "amazon" + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Assemblyai(UncheckedBaseModel): + vendor: typing.Literal["assemblyai"] = "assemblyai" + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Speechmatics(UncheckedBaseModel): + vendor: typing.Literal["speechmatics"] = "speechmatics" + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = 
pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Sarvam(UncheckedBaseModel): + vendor: typing.Literal["sarvam"] = "sarvam" + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +Asr = typing_extensions.Annotated[ + typing.Union[ + Asr_Ares, + Asr_Microsoft, + Asr_Deepgram, + Asr_Openai, + Asr_Google, + Asr_Amazon, + Asr_Assemblyai, + Asr_Speechmatics, + Asr_Sarvam, + ], + UnionMetadata(discriminant="vendor"), +] diff --git a/src/agora_agent/types/asr_language.py b/src/agora_agent/types/asr_language.py new file mode 100644 index 0000000..4ff3c88 --- /dev/null +++ b/src/agora_agent/types/asr_language.py @@ -0,0 +1,41 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AsrLanguage = typing.Union[ + typing.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ], + typing.Any, +] diff --git a/src/agora_agent/types/assembly_ai_asr.py b/src/agora_agent/types/assembly_ai_asr.py new file mode 100644 index 0000000..ea2ebf4 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams + + +class AssemblyAiAsr(UncheckedBaseModel): + """ + AssemblyAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/assembly_ai_asr_params.py b/src/agora_agent/types/assembly_ai_asr_params.py new file mode 100644 index 0000000..f3a5818 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AssemblyAiAsrParams(UncheckedBaseModel): + """ + AssemblyAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + AssemblyAI API key + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for AssemblyAI's streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_output_format.py b/src/agora_agent/types/cartesia_tts_output_format.py new file mode 100644 index 0000000..ab7e122 --- /dev/null +++ b/src/agora_agent/types/cartesia_tts_output_format.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class CartesiaTtsOutputFormat(UncheckedBaseModel): + """ + Cartesia audio output format configuration. 
+ """ + + container: typing.Optional[str] = pydantic.Field(default=None) + """ + Audio container format for the output stream + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sampling rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_params.py b/src/agora_agent/types/cartesia_tts_params.py index 2aaf069..1478570 100644 --- a/src/agora_agent/types/cartesia_tts_params.py +++ b/src/agora_agent/types/cartesia_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .cartesia_tts_output_format import CartesiaTtsOutputFormat from .cartesia_tts_voice import CartesiaTtsVoice @@ -18,15 +19,21 @@ class CartesiaTtsParams(UncheckedBaseModel): Cartesia API key """ - voice: CartesiaTtsVoice - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: str = pydantic.Field() """ - Model ID (optional) + Model ID (for example, sonic-2) """ - sample_rate: typing.Optional[int] = pydantic.Field(default=None) + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Cartesia streaming API + """ + + voice: CartesiaTtsVoice + output_format: typing.Optional[CartesiaTtsOutputFormat] = None + language: typing.Optional[str] = pydantic.Field(default=None) """ - Audio sampling rate in Hz + Target language for speech synthesis """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py new file mode 100644 index 0000000..1c79c7b --- /dev/null +++ b/src/agora_agent/types/deepgram_asr.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .deepgram_asr_params import DeepgramAsrParams + + +class DeepgramAsr(UncheckedBaseModel): + """ + Deepgram ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands for preset-backed Deepgram usage. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py new file mode 100644 index 0000000..259958e --- /dev/null +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class DeepgramAsrParams(UncheckedBaseModel): + """ + Deepgram ASR configuration parameters. 
+ """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for Deepgram's streaming API + """ + + key: str = pydantic.Field() + """ + Deepgram API key + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Speech recognition model + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for speech recognition + """ + + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/eleven_labs_tts_params.py b/src/agora_agent/types/eleven_labs_tts_params.py index c6127fd..b61e3de 100644 --- a/src/agora_agent/types/eleven_labs_tts_params.py +++ b/src/agora_agent/types/eleven_labs_tts_params.py @@ -37,6 +37,31 @@ class ElevenLabsTtsParams(UncheckedBaseModel): Audio sample rate in Hz (16kHz for Akool, 24kHz for HeyGen) """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech speed multiplier. + """ + + stability: typing.Optional[float] = pydantic.Field(default=None) + """ + Voice stability. Higher values produce more consistent speech. + """ + + similarity_boost: typing.Optional[float] = pydantic.Field(default=None) + """ + Similarity boost for the selected voice. + """ + + style: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking style and expressiveness control. + """ + + use_speaker_boost: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to improve voice quality and similarity. 
+ """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/fish_audio_tts_params.py b/src/agora_agent/types/fish_audio_tts_params.py index 0ad77aa..60bcff4 100644 --- a/src/agora_agent/types/fish_audio_tts_params.py +++ b/src/agora_agent/types/fish_audio_tts_params.py @@ -12,7 +12,7 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Fish Audio API key """ @@ -22,6 +22,11 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio reference ID """ + backend: typing.Optional[str] = pydantic.Field(default=None) + """ + Backend model version to use + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/google_asr.py b/src/agora_agent/types/google_asr.py new file mode 100644 index 0000000..8473a04 --- /dev/null +++ b/src/agora_agent/types/google_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .google_asr_params import GoogleAsrParams + + +class GoogleAsr(UncheckedBaseModel): + """ + Google ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_asr_params.py b/src/agora_agent/types/google_asr_params.py new file mode 100644 index 0000000..9d17db6 --- /dev/null +++ b/src/agora_agent/types/google_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleAsrParams(UncheckedBaseModel): + """ + Google ASR configuration parameters. + """ + + project_id: str = pydantic.Field() + """ + Google Cloud project ID + """ + + location: str = pydantic.Field() + """ + Google Cloud region for the speech service + """ + + adc_credentials_string: str = pydantic.Field() + """ + Google Cloud service account credentials JSON string + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Recognition model to use + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_audio_config.py b/src/agora_agent/types/google_tts_audio_config.py new file mode 100644 index 0000000..9c2a405 --- /dev/null +++ b/src/agora_agent/types/google_tts_audio_config.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsAudioConfig(UncheckedBaseModel): + """ + Google audio output configuration. + """ + + speaking_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_params.py b/src/agora_agent/types/google_tts_params.py index dc00322..4a9ee38 100644 --- a/src/agora_agent/types/google_tts_params.py +++ b/src/agora_agent/types/google_tts_params.py @@ -3,8 +3,12 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel +from .google_tts_audio_config import GoogleTtsAudioConfig +from .google_tts_voice_selection_params import GoogleTtsVoiceSelectionParams class GoogleTtsParams(UncheckedBaseModel): @@ -12,25 +16,17 @@ class GoogleTtsParams(UncheckedBaseModel): Google TTS configuration parameters. 
""" - key: str = pydantic.Field() + credentials: str = pydantic.Field() """ - Google Cloud API key + Google Cloud service account credentials JSON string """ - voice_name: str = pydantic.Field() - """ - Google voice name - """ - - language_code: typing.Optional[str] = pydantic.Field(default=None) - """ - Language code (e.g., "en-US") - """ - - sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) - """ - Sample rate in Hz (default depends on selected voice) - """ + voice_selection_params: typing_extensions.Annotated[ + GoogleTtsVoiceSelectionParams, FieldMetadata(alias="VoiceSelectionParams") + ] + audio_config: typing_extensions.Annotated[ + typing.Optional[GoogleTtsAudioConfig], FieldMetadata(alias="AudioConfig") + ] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/google_tts_voice_selection_params.py b/src/agora_agent/types/google_tts_voice_selection_params.py new file mode 100644 index 0000000..ee75953 --- /dev/null +++ b/src/agora_agent/types/google_tts_voice_selection_params.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsVoiceSelectionParams(UncheckedBaseModel): + """ + Google voice selection parameters. 
+ """ + + name: str = pydantic.Field() + """ + Google voice name + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/hume_ai_tts_params.py b/src/agora_agent/types/hume_ai_tts_params.py index 08cb12b..1480fd4 100644 --- a/src/agora_agent/types/hume_ai_tts_params.py +++ b/src/agora_agent/types/hume_ai_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .hume_ai_tts_params_provider import HumeAiTtsParamsProvider class HumeAiTtsParams(UncheckedBaseModel): @@ -17,9 +18,34 @@ class HumeAiTtsParams(UncheckedBaseModel): Hume AI API key """ + voice_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Hume AI voice ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Base URL for the Hume AI API + """ + + provider: typing.Optional[HumeAiTtsParamsProvider] = pydantic.Field(default=None) + """ + Voice provider type + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Playback speed of the generated speech + """ + + trailing_silence: typing.Optional[float] = pydantic.Field(default=None) + """ + Duration of silence in seconds to add at the end of each utterance + """ + config_id: typing.Optional[str] = pydantic.Field(default=None) """ - Hume AI configuration ID + Hume AI configuration ID. Deprecated; use voice_id for the documented TTS shape. 
""" if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/hume_ai_tts_params_provider.py b/src/agora_agent/types/hume_ai_tts_params_provider.py new file mode 100644 index 0000000..cf07e73 --- /dev/null +++ b/src/agora_agent/types/hume_ai_tts_params_provider.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +HumeAiTtsParamsProvider = typing.Union[typing.Literal["HUME_AI", "CUSTOM_VOICE"], typing.Any] diff --git a/src/agora_agent/types/llm.py b/src/agora_agent/types/llm.py new file mode 100644 index 0000000..2b0283d --- /dev/null +++ b/src/agora_agent/types/llm.py @@ -0,0 +1,120 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .llm_params import LlmParams +from .llm_style import LlmStyle + + +class Llm(UncheckedBaseModel): + """ + Large language model (LLM) configuration. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM callback address. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM verification API key. + """ + + access_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS access key ID. Used by Amazon Bedrock when api_key is not provided. + """ + + secret_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS secret access key. Used by Amazon Bedrock when api_key is not provided. + """ + + region: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS region. Used by Amazon Bedrock. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Top-level model identifier. Used by Amazon Bedrock. + """ + + system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + A set of predefined information used as input to the LLM. 
+ """ + + params: typing.Optional[LlmParams] = None + max_history: typing.Optional[int] = pydantic.Field(default=None) + """ + The number of conversation history messages cached in the custom LLM. + """ + + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM input modalities. + """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Prompt for agent activation failure. + """ + + vendor: typing.Optional[str] = pydantic.Field(default=None) + """ + LLM provider identifier. + """ + + style: typing.Optional[LlmStyle] = pydantic.Field(default=None) + """ + The request style for chat completion. + """ + + ignore_empty: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to handle empty Gemini responses. + """ + + greeting_configs: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) + """ + Agent greeting broadcast configuration. + """ + + template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Template parameter configuration. + """ + + mcp_servers: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + MCP server configuration. + """ + + headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Custom headers to include in requests to the LLM. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_params.py b/src/agora_agent/types/llm_params.py new file mode 100644 index 0000000..f6df01f --- /dev/null +++ b/src/agora_agent/types/llm_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class LlmParams(UncheckedBaseModel): + """ + Additional LLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM model identifier. + """ + + max_tokens: typing.Optional[int] = pydantic.Field(default=None) + """ + Maximum tokens in the response. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_style.py b/src/agora_agent/types/llm_style.py new file mode 100644 index 0000000..8319ca1 --- /dev/null +++ b/src/agora_agent/types/llm_style.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +LlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify", "bedrock"], typing.Any] diff --git a/src/agora_agent/types/microsoft_asr.py b/src/agora_agent/types/microsoft_asr.py new file mode 100644 index 0000000..f602e09 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .microsoft_asr_params import MicrosoftAsrParams + + +class MicrosoftAsr(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_asr_params.py b/src/agora_agent/types/microsoft_asr_params.py new file mode 100644 index 0000000..bea79e4 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr_params.py @@ -0,0 +1,42 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MicrosoftAsrParams(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration parameters. 
+ """ + + key: str = pydantic.Field() + """ + Microsoft Azure API key + """ + + region: str = pydantic.Field() + """ + Azure region + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + phrase_list: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + Words or phrases to improve recognition accuracy + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_tts_params.py b/src/agora_agent/types/microsoft_tts_params.py index 3c9e80c..12f441e 100644 --- a/src/agora_agent/types/microsoft_tts_params.py +++ b/src/agora_agent/types/microsoft_tts_params.py @@ -32,6 +32,16 @@ class MicrosoftTtsParams(UncheckedBaseModel): Audio sampling rate in Hz """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. Values between 0.5 and 2.0. + """ + + volume: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio volume. Values between 0.0 and 100.0. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/mllm.py b/src/agora_agent/types/mllm.py new file mode 100644 index 0000000..3bcdb95 --- /dev/null +++ b/src/agora_agent/types/mllm.py @@ -0,0 +1,88 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_params import MllmParams +from .mllm_turn_detection import MllmTurnDetection +from .mllm_vendor import MllmVendor + + +class Mllm(UncheckedBaseModel): + """ + Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. + """ + + enable: typing.Optional[bool] = pydantic.Field(default=None) + """ + Enable Multimodal Large Language Model. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM WebSocket URL for real-time communication. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The API key used for MLLM authentication. + """ + + adc_credentials_string: typing.Optional[str] = pydantic.Field(default=None) + """ + Base64-encoded Google Cloud Application Default Credentials. Used by Vertex AI. + """ + + project_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud project ID. Used by Vertex AI. + """ + + location: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud location or region. Used by Vertex AI. + """ + + messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + Array of conversation items used for short-term memory management. + """ + + params: typing.Optional[MllmParams] = None + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM input modalities. + """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting message. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent failure message. 
+ """ + + vendor: typing.Optional[MllmVendor] = pydantic.Field(default=None) + """ + MLLM provider. + """ + + turn_detection: typing.Optional[MllmTurnDetection] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_http_options.py b/src/agora_agent/types/mllm_http_options.py new file mode 100644 index 0000000..19baebb --- /dev/null +++ b/src/agora_agent/types/mllm_http_options.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmHttpOptions(UncheckedBaseModel): + """ + HTTP request options for the MLLM provider. + """ + + api_version: typing.Optional[str] = pydantic.Field(default=None) + """ + API version to use. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_input_audio_transcription.py b/src/agora_agent/types/mllm_input_audio_transcription.py new file mode 100644 index 0000000..6bb3d9d --- /dev/null +++ b/src/agora_agent/types/mllm_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmInputAudioTranscription(UncheckedBaseModel): + """ + Configuration for audio input transcription. 
+ """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language of the input audio. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Model to use for transcription. + """ + + prompt: typing.Optional[str] = pydantic.Field(default=None) + """ + Text to guide the transcription model. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_params.py b/src/agora_agent/types/mllm_params.py new file mode 100644 index 0000000..5437b69 --- /dev/null +++ b/src/agora_agent/types/mllm_params.py @@ -0,0 +1,71 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_http_options import MllmHttpOptions +from .mllm_input_audio_transcription import MllmInputAudioTranscription + + +class MllmParams(UncheckedBaseModel): + """ + Additional MLLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM model identifier. + """ + + voice: typing.Optional[str] = pydantic.Field(default=None) + """ + Voice identifier for audio output. + """ + + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + System instructions that define the agent behavior or tone. + """ + + input_audio_transcription: typing.Optional[MllmInputAudioTranscription] = None + affective_dialog: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to enable Gemini affective dialog. + """ + + proactive_audio: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether Gemini may choose not to respond when no reply is needed. 
+ """ + + transcribe_agent: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the agent speech in real time. + """ + + transcribe_user: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the user speech in real time. + """ + + http_options: typing.Optional[MllmHttpOptions] = None + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for xAI Grok speech recognition and synthesis. + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sample rate in Hz. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection.py b/src/agora_agent/types/mllm_turn_detection.py new file mode 100644 index 0000000..2cd3503 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection.py @@ -0,0 +1,35 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_agora_vad_config import MllmTurnDetectionAgoraVadConfig +from .mllm_turn_detection_mode import MllmTurnDetectionMode +from .mllm_turn_detection_semantic_vad_config import MllmTurnDetectionSemanticVadConfig +from .mllm_turn_detection_server_vad_config import MllmTurnDetectionServerVadConfig + + +class MllmTurnDetection(UncheckedBaseModel): + """ + Turn detection configuration for the MLLM module. + """ + + mode: typing.Optional[MllmTurnDetectionMode] = pydantic.Field(default=None) + """ + Turn detection mode for MLLM. 
+ """ + + agora_vad_config: typing.Optional[MllmTurnDetectionAgoraVadConfig] = None + server_vad_config: typing.Optional[MllmTurnDetectionServerVadConfig] = None + semantic_vad_config: typing.Optional[MllmTurnDetectionSemanticVadConfig] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py new file mode 100644 index 0000000..4168ef3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py @@ -0,0 +1,23 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): + interrupt_duration_ms: typing.Optional[int] = None + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_mode.py b/src/agora_agent/types/mllm_turn_detection_mode.py new file mode 100644 index 0000000..f6cd693 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_mode.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionMode = typing.Union[typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py new file mode 100644 index 0000000..aeaf440 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py @@ -0,0 +1,21 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_semantic_vad_config_eagerness import MllmTurnDetectionSemanticVadConfigEagerness + + +class MllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): + eagerness: typing.Optional[MllmTurnDetectionSemanticVadConfigEagerness] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py new file mode 100644 index 0000000..dbf9b4d --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionSemanticVadConfigEagerness = typing.Union[typing.Literal["auto", "low", "medium", "high"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_server_vad_config.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py new file mode 100644 index 0000000..b2976b3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, +) +from .mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, +) + + +class MllmTurnDetectionServerVadConfig(UncheckedBaseModel): + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + idle_timeout_ms: typing.Optional[int] = None + start_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity] = None + end_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py similarity index 61% rename from 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py index e92d3f1..b9b3377 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ typing.Literal["END_SENSITIVITY_HIGH", "END_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py similarity index 61% rename from src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py index 25860c1..90ccf51 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ typing.Literal["START_SENSITIVITY_HIGH", "START_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/types/mllm_vendor.py b/src/agora_agent/types/mllm_vendor.py new file mode 100644 index 0000000..61c4d1a --- /dev/null +++ 
b/src/agora_agent/types/mllm_vendor.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +MllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/types/murf_tts_params.py b/src/agora_agent/types/murf_tts_params.py index 5107f62..94d68db 100644 --- a/src/agora_agent/types/murf_tts_params.py +++ b/src/agora_agent/types/murf_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,19 +14,44 @@ class MurfTtsParams(UncheckedBaseModel): Murf TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Murf API key """ - voice_id: str = pydantic.Field() + base_url: str = pydantic.Field() """ - Voice ID (e.g., Ariana, Natalie, Ken) + WebSocket endpoint for streaming TTS output """ - style: typing.Optional[str] = pydantic.Field(default=None) + voice_id: typing_extensions.Annotated[str, FieldMetadata(alias="voiceId")] = pydantic.Field() """ - Voice style (e.g., Angry, Sad, Conversational, Newscast) + Voice ID (e.g., Matthew) + """ + + locale: typing.Optional[str] = pydantic.Field(default=None) + """ + Locale for the selected voice + """ + + rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech rate adjustment + """ + + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + TTS model to use + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/open_ai_asr.py b/src/agora_agent/types/open_ai_asr.py new file mode 100644 index 0000000..eec2aab --- /dev/null +++ 
b/src/agora_agent/types/open_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .open_ai_asr_params import OpenAiAsrParams + + +class OpenAiAsr(UncheckedBaseModel): + """ + OpenAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_asr_params.py b/src/agora_agent/types/open_ai_asr_params.py new file mode 100644 index 0000000..a5fadc8 --- /dev/null +++ b/src/agora_agent/types/open_ai_asr_params.py @@ -0,0 +1,30 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .open_ai_input_audio_transcription import OpenAiInputAudioTranscription + + +class OpenAiAsrParams(UncheckedBaseModel): + """ + OpenAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + OpenAI API key + """ + + input_audio_transcription: OpenAiInputAudioTranscription + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_input_audio_transcription.py b/src/agora_agent/types/open_ai_input_audio_transcription.py new file mode 100644 index 0000000..9db45b1 --- /dev/null +++ b/src/agora_agent/types/open_ai_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class OpenAiInputAudioTranscription(UncheckedBaseModel): + """ + OpenAI audio transcription configuration. + """ + + model: str = pydantic.Field() + """ + OpenAI ASR model to use for transcription + """ + + prompt: str = pydantic.Field() + """ + Prompt that guides the transcription process + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 3839646..c8f6e51 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,12 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key. Optional for Agora-managed OpenAI TTS usage. + OpenAI API key. Optional for preset-backed OpenAI TTS usage. 
+ """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Endpoint URL for the OpenAI TTS service. """ voice: str = pydantic.Field() @@ -27,6 +32,16 @@ class OpenAiTtsParams(UncheckedBaseModel): Model name (e.g., "tts-1", "tts-1-hd") """ + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + Custom instructions for voice style, accent, pace, and tone. + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/rime_tts_params.py b/src/agora_agent/types/rime_tts_params.py index 6d18375..aae3ef2 100644 --- a/src/agora_agent/types/rime_tts_params.py +++ b/src/agora_agent/types/rime_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,7 +14,7 @@ class RimeTtsParams(UncheckedBaseModel): Rime TTS configuration parameters. 
""" - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Rime API key """ @@ -22,9 +24,16 @@ class RimeTtsParams(UncheckedBaseModel): Rime speaker ID """ - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="modelId")] = pydantic.Field( + default=None + ) """ - Model ID (optional) + Rime TTS model ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Rime streaming API """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/sarvam_asr.py b/src/agora_agent/types/sarvam_asr.py new file mode 100644 index 0000000..ec95847 --- /dev/null +++ b/src/agora_agent/types/sarvam_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .sarvam_asr_params import SarvamAsrParams + + +class SarvamAsr(UncheckedBaseModel): + """ + Sarvam ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_asr_params.py b/src/agora_agent/types/sarvam_asr_params.py new file mode 100644 index 0000000..f29769d --- /dev/null +++ b/src/agora_agent/types/sarvam_asr_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SarvamAsrParams(UncheckedBaseModel): + """ + Sarvam ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Sarvam API key + """ + + language: str = pydantic.Field() + """ + Language code for transcription. Set to unknown for automatic language detection. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_tts_params.py b/src/agora_agent/types/sarvam_tts_params.py index 93457a4..855299f 100644 --- a/src/agora_agent/types/sarvam_tts_params.py +++ b/src/agora_agent/types/sarvam_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .sarvam_tts_params_target_language_code import SarvamTtsParamsTargetLanguageCode class SarvamTtsParams(UncheckedBaseModel): @@ -12,7 +13,7 @@ class SarvamTtsParams(UncheckedBaseModel): Sarvam TTS configuration parameters. 
""" - key: str = pydantic.Field() + api_subscription_key: str = pydantic.Field() """ Sarvam API subscription key """ @@ -22,11 +23,31 @@ class SarvamTtsParams(UncheckedBaseModel): Voice ID (e.g., anushka, abhilash, karun, hitesh, manisha, vidya, arya) """ - target_language_code: str = pydantic.Field() + target_language_code: SarvamTtsParamsTargetLanguageCode = pydantic.Field() """ Target language code (e.g., en-IN) """ + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment for the voice + """ + + pace: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + loudness: typing.Optional[float] = pydantic.Field(default=None) + """ + Volume level of the speech + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/sarvam_tts_params_target_language_code.py b/src/agora_agent/types/sarvam_tts_params_target_language_code.py new file mode 100644 index 0000000..b1722ec --- /dev/null +++ b/src/agora_agent/types/sarvam_tts_params_target_language_code.py @@ -0,0 +1,8 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +SarvamTtsParamsTargetLanguageCode = typing.Union[ + typing.Literal["en-IN", "hi-IN", "bn-IN", "ta-IN", "te-IN", "kn-IN", "ml-IN", "mr-IN", "gu-IN", "pa-IN", "or-IN"], + typing.Any, +] diff --git a/src/agora_agent/types/speechmatics_asr.py b/src/agora_agent/types/speechmatics_asr.py new file mode 100644 index 0000000..644db25 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class SpeechmaticsAsr(UncheckedBaseModel): + """ + Speechmatics ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/speechmatics_asr_params.py b/src/agora_agent/types/speechmatics_asr_params.py new file mode 100644 index 0000000..4709d22 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SpeechmaticsAsrParams(UncheckedBaseModel): + """ + Speechmatics ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Speechmatics API key + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Speechmatics streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py deleted file mode 100644 index fa73fc0..0000000 --- a/tests/custom/test_avatar_token.py +++ /dev/null @@ -1,12 +0,0 @@ -from agora_agent.agentkit import generate_convo_ai_token - - -def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): - token = generate_convo_ai_token( - app_id="0" * 32, - app_certificate="1" * 32, - channel_name="room", - uid=123, - ) - - assert token.startswith("007") diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py deleted file mode 100644 index faca9bf..0000000 --- a/tests/custom/test_llm_vendors.py +++ /dev/null @@ -1,60 +0,0 @@ -from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM - - -def test_groq_serializes_as_openai_compatible() -> None: - config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() - - assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" - assert config["api_key"] == "groq-key" - assert config["style"] == "openai" - assert config["params"]["model"] == "llama-3.3-70b-versatile" - - -def test_custom_llm_marks_request_as_custom() -> None: - config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() - - assert config["url"] == "https://llm.example.com/chat" - assert config["api_key"] == "key" - assert config["vendor"] == "custom" - assert config["style"] == "openai" - 
- -def test_vertex_ai_llm_includes_project_routing() -> None: - config = VertexAILLM( - api_key="vertex-token", - model="gemini-2.0-flash", - project_id="project", - location="us-central1", - ).to_config() - - assert config["api_key"] == "vertex-token" - assert config["style"] == "gemini" - assert config["params"]["model"] == "gemini-2.0-flash" - assert config["params"]["project_id"] == "project" - assert config["params"]["location"] == "us-central1" - - -def test_amazon_bedrock_serializes_as_anthropic_style() -> None: - config = AmazonBedrock( - api_key="bedrock-key", - url="https://bedrock.example.com/messages", - model="anthropic.claude-3-5-sonnet-20241022-v2:0", - ).to_config() - - assert config["api_key"] == "bedrock-key" - assert config["style"] == "anthropic" - assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" - - -def test_dify_serializes_conversation_fields() -> None: - config = Dify( - api_key="dify-key", - url="https://api.dify.ai/v1/chat-messages", - user="user-1", - conversation_id="conversation-1", - ).to_config() - - assert config["api_key"] == "dify-key" - assert config["style"] == "dify" - assert config["params"]["user"] == "user-1" - assert config["params"]["conversation_id"] == "conversation-1" diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py deleted file mode 100644 index 9b2f508..0000000 --- a/tests/custom/test_root_exports.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest - -import agora_agent -import agora_agent.agentkit as agentkit - - -def test_root_exports_match_agentkit_for_common_symbols() -> None: - for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): - assert getattr(agora_agent, name) is getattr(agentkit, name) - - -def test_root_exports_fern_client_symbols() -> None: - assert agora_agent.Agora is not None - assert agora_agent.Area is not None - assert agora_agent.AsyncAgora is not None - - -def 
test_unknown_root_export_raises_attribute_error() -> None: - with pytest.raises(AttributeError): - _ = agora_agent.NotARealExportName - - -def test_dir_includes_agentkit_vendor_exports() -> None: - assert "DeepgramSTT" in dir(agora_agent) - - -def test_all_includes_agentkit_vendor_exports() -> None: - assert "DeepgramSTT" in agora_agent.__all__ - assert "OpenAI" in agora_agent.__all__ From df2c8d60b46bc39fbe16e13b3b2daeac0b51d0e7 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 00:10:29 +0000 Subject: [PATCH 05/26] [fern-replay] Applied customizations Patches applied (5): - patch-64703bda: test(agentkit): add custom tests for v1.5.0 AgentKit behavior - patch-7c2d9d99: feat(agentkit): align session options and token uid handling - patch-7465fada: fix(agentkit): resolve Python session typing issues - patch-fae1249a: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. - patch-44c21c14: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
Patches with unresolved conflicts (17): - patch-6e30398b: chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases - patch-9df782b4: feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 - patch-26706d73: feat(agentkit): add GenericAvatar and session-aware avatar validation - patch-9f491c63: feat(agentkit): update Agent builder and session lifecycle for v2.7 - patch-6c20f076: docs(agentkit): update v1.5.0 guides, reference, and changelog - patch-eaec58eb: refactor(agentkit): align deprecated vendor aliases with canonical names - patch-20245632: feat(agentkit): export type aliases and avatar token helpers - patch-972dd5bd: updated docs - patch-4323b470: rename python package to agora-agents - patch-d29165c4: make python compat package publishable - patch-fc9d93c3: Document agora-agents PyPI install name and migration notes - patch-87fc4488: Update docs to import from agora_agent package root - patch-923cf954: Prioritize app credentials and builder in Python docs Rewrite getting-started auth and quick-start for app credentials with the builder API. De-emphasize presets and align index, BYOK, and README with the recommended onboarding path. - patch-d475306b: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. - patch-c9355576: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. - patch-98ecb4d3: Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM vendor helpers. Introduce named LLM vendor classes with correct request serialization, export them from the package root, and add tests covering each provider's config shape. 
- patch-a5097b8d: Document new LLM vendors and tighten onboarding docs. Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM to vendor references, simplify README and index navigation, and align quick-start and terminology with Agora-managed model language. Run `fern-replay resolve` to apply these customizations. Patches absorbed by generator (3): - patch-b7f0c36c: feat(agentkit): release v2.0.0 updates - patch-4d32368c: Add compat-build CI job and harden dual-package PyPI publish Build and verify the compat wheel re-exports, gate publish on compat-build, simplify version checks with poetry version, wait for primary package on PyPI, and retry compat publish on failure. - patch-20109390: Fix PyPI publish auth and explicitly protect release workflow in Fern ignore. Use PYPI_API_TOKEN for primary and compat Poetry publishes, matching the v1.4.1 release flow, and list release.yml explicitly in .fernignore. The generator now produces these customizations natively. --- .fern/replay.lock | 12088 ++++++++++++++++++- src/agora_agent/agentkit/agent.py | 2 + src/agora_agent/agentkit/agent_session.py | 1 + src/agora_agent/agentkit/vendors/avatar.py | 43 + src/agora_agent/agentkit/vendors/mllm.py | 1 + tests/custom/test_agentkit_agent.py | 298 + tests/custom/test_agentkit_session.py | 383 + tests/custom/test_agentkit_vendors.py | 122 + tests/custom/test_avatar_token.py | 12 + tests/custom/test_llm_vendors.py | 60 + tests/custom/test_root_exports.py | 29 + 11 files changed, 13037 insertions(+), 2 deletions(-) create mode 100644 tests/custom/test_agentkit_agent.py create mode 100644 tests/custom/test_agentkit_session.py create mode 100644 tests/custom/test_agentkit_vendors.py create mode 100644 tests/custom/test_avatar_token.py create mode 100644 tests/custom/test_llm_vendors.py create mode 100644 tests/custom/test_root_exports.py diff --git a/.fern/replay.lock b/.fern/replay.lock index 536b6e4..fcd9525 100644 --- a/.fern/replay.lock +++ b/.fern/replay.lock @@ -6,5 +6,12089 @@ 
generations: timestamp: 2026-05-20T20:38:02.180Z cli_version: unknown generator_versions: {} -current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf -patches: [] + - commit_sha: f652c69edbd1815c832fc9354c193090ac8dde8e + tree_hash: 6a32ee744683b30c1c77191210d46b16a2a78ca4 + timestamp: 2026-06-02T00:10:16.318Z + cli_version: unknown + generator_versions: + fernapi/fern-python-sdk: 4.37.0 +current_generation: f652c69edbd1815c832fc9354c193090ac8dde8e +patches: + - id: patch-6e30398b + content_hash: sha256:e99898e508e2d6cb9f134cc33e0b73c1c8acb845f5887924e0e38031a6e089c0 + original_commit: 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 + original_message: "chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/vendors/__init__.py + patch_content: | + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 1942bce..5ceda66 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -1,13 +1,30 @@ + from .agent import ( + Agent, + + AgentConfig, + + AgentConfigUpdate, + + ConversationHistory, + + ConversationRole, + + ConversationSessionTurn, + + ConversationTurn, + + ConversationTurns, + StartAgentsRequestProperties, + + AvatarConfig, + + AvatarVendor, + GeofenceConfig, + + LlmConfig, + + LlmStyle, + + MllmConfig, + + MllmVendor, + RtcConfig, + + SttConfig, + + SttVendor, + + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + @@ -37,9 +54,14 @@ from .agent import ( + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + 
McpServersItem, + + SessionInfo, + + SessionListResponse, + + SessionSummary, + + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + @@ -57,8 +79,10 @@ from ..agent_management.types.agent_think_agent_management_request_on_speaking_a + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -112,6 +136,7 @@ from .vendors import ( + FishAudioTTS, + Gemini, + GeminiLive, + + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + @@ -132,14 +157,27 @@ from .vendors import ( + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + + XaiGrok, + + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + + "AgentConfig", + + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + + "LlmConfig", + + "LlmStyle", + + "SttConfig", + + "SttVendor", + + "TtsConfig", + + "MllmConfig", + + "MllmVendor", + + "AvatarConfig", + + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + @@ -147,6 +185,7 @@ __all__ = [ + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + @@ -181,6 +220,7 @@ __all__ = [ + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + @@ -197,6 +237,15 @@ __all__ = [ + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + + "SessionInfo", + + "SessionListResponse", + + "SessionSummary", + + "ConversationHistory", + + "ConversationTurn", + + "ConversationRole", + + "ConversationTurns", + + "ConversationSessionTurn", + 
+ "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + @@ -253,14 +302,19 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + + "is_generic_avatar", + + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 0320843..689eab1 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -11,9 +11,9 @@ from .base import ( + OpenAISampleRate, + SampleRate, + ) + -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar + +from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + @@ -82,8 +82,11 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + ] + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + 
FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + 
GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + XaiGrok, + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + 
"EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", 
+ "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + 
"SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + status: unresolved + - id: patch-9df782b4 + content_hash: sha256:84c08fe3239d2ecb0b0a3ddd33b0dce4e7b012125be797aa83ca12893363b565 + original_commit: 9df782b46d872599f103078e30c5ded2053f2517 + original_message: "feat(agentkit): update MLLM and LLM vendor wrappers for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:57:54 -0400 + Subject: [PATCH] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 + + Adds xAI Grok Realtime and Vertex AI MLLM wrappers, and aligns MLLM + config serialization with the generated core types. LLM vendors now + accept typed greeting_configs and serialize them through the generated + model shape, including interruptable. 
+ --- + src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- + src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ + 2 files changed, 113 insertions(+), 36 deletions(-) + + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 7465c9f..6f74b43 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,9 +1,14 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] + result.append(item) + return result + + + + +def _dump_optional_model(value: Any) -> Any: + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -74,7 +87,7 @@ class OpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: 
Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -150,7 +163,7 @@ class AzureOpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -216,7 +229,7 @@ class Anthropic(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + 
greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -287,7 +300,7 @@ class Gemini(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 5f6f940..cd6cd07 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -53,18 +52,97 @@ class OpenAIRealtime(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if 
self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + + + +class XaiGrokOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="xAI API key") + + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + + + +class XaiGrok(BaseMLLM): + + """xAI 
Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + + + def __init__(self, **kwargs: Any): + + self.options = XaiGrokOptions(**kwargs) + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = dict(self.options.params or {}) + + if self.options.voice is not None: + + params["voice"] = self.options.voice + + if self.options.language is not None: + + params["language"] = self.options.language + + if self.options.sample_rate is not None: + + params["sample_rate"] = self.options.sample_rate + + + + config: Dict[str, Any] = { + + "vendor": "xai", + + "api_key": self.options.api_key, + + "url": self.options.url, + + "params": params, + + } + + + + if self.options.greeting_message is not None: + + config["greeting_message"] = self.options.greeting_message + + if self.options.input_modalities is not None: + + config["input_modalities"] = self.options.input_modalities + + if self.options.output_modalities is not None: + + config["output_modalities"] = self.options.output_modalities + + if self.options.messages is not None: + + config["messages"] = self.options.messages + + if self.options.failure_message is not None: + + config["failure_message"] = self.options.failure_message + + if self.options.turn_detection is not None: + + config["turn_detection"] = self.options.turn_detection + + + + return config + + + + + +class XaiRealtimeOptions(XaiGrokOptions): + + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + + + def __init__(self, **data: Any): + + warnings.warn( + + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**data) + + + + + +class XaiRealtime(XaiGrok): + + """Deprecated: use :class:`XaiGrok` instead.""" + + + + def __init__(self, **kwargs: Any): + + warnings.warn( + + "XaiRealtime is deprecated; use XaiGrok instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**kwargs) + + + + + class VertexAIOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + - params: Dict[str, Any] = { + - "model": self.options.model, + - "project_id": self.options.project_id, + - "location": self.options.location, + - "adc_credentials_string": self.options.adc_credentials_string, + - } + - + + # additional_params spread first so that explicit fields always win, + + # matching the TypeScript SDK. 
+ + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + params["model"] = self.options.model + + params["project_id"] = self.options.project_id + + params["location"] = self.options.location + + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + - if self.options.additional_params is not None: + - params.update(self.options.additional_params) + + config: Dict[str, Any] = { + "vendor": "vertexai", + @@ -119,12 +193,8 @@ class VertexAI(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -179,12 
+247,8 @@ class GeminiLive(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(default="gpt-4o-mini", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + endpoint: str = Field(..., 
description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + max_tokens: Optional[int] = Field(default=None, gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + 
failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or "https://api.anthropic.com/v1/messages", + "api_key": self.options.api_key, + "params": params, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = 
self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + src/agora_agent/agentkit/vendors/mllm.py: 
| + import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, 
description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + + class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: 
str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: 
Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + - id: patch-26706d73 + content_hash: sha256:a9551e0b774b96e7734e9faa7d770611861cf443837428272ef75710447238da + original_commit: 26706d73ae15d860d57daf926837632c01be7f10 + original_message: "feat(agentkit): add GenericAvatar and 
session-aware avatar validation" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/vendors/avatar.py + patch_content: |+ + From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:59:22 -0400 + Subject: [PATCH] feat(agentkit): add GenericAvatar and session-aware avatar + validation + + Adds the GenericAvatar vendor wrapper and extends avatar validation + helpers for generic and RTC-backed avatars. Session-derived fields such + as agora_appid, agora_channel, and agora_token can now be validated + after AgentSession enrichment. + --- + src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- + src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ + 2 files changed, 76 insertions(+), 1 deletion(-) + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index 9e132a9..a04809c 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + return config.get("vendor") == "generic" + + + + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + ) + + + + + +def validate_avatar_config( + + config: typing.Dict[str, typing.Any], + + require_session_fields: bool = False, + +) -> None: + """Validates avatar configuration at runtime. 
+ + Parameters + @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + + if require_session_fields and not params.get("agora_token"): + + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + + elif is_generic_avatar(config): + + params = config.get("params", {}) + + if not params.get("api_key"): + + raise ValueError("Generic avatar requires api_key") + + if not params.get("api_base_url"): + + raise ValueError("Generic avatar requires api_base_url") + + if not params.get("avatar_id"): + + raise ValueError("Generic avatar requires avatar_id") + + if not params.get("agora_uid"): + + raise ValueError("Generic avatar requires agora_uid") + + if require_session_fields: + + if not params.get("agora_token"): + + raise ValueError("Generic avatar requires agora_token after session enrichment") + + if not params.get("agora_appid"): + + raise ValueError("Generic avatar requires agora_appid after session enrichment") + + if not params.get("agora_channel"): + + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index b83a356..00cad8f 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -132,6 +132,48 @@ class LiveAvatarAvatar(BaseAvatar): + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + +class GenericAvatarOptions(BaseModel): + + 
model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Generic avatar provider API key") + + api_base_url: str = Field(..., description="Avatar provider API base URL") + + avatar_id: str = Field(..., description="Avatar ID") + + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + +class GenericAvatar(BaseAvatar): + + def __init__(self, **kwargs: Any): + + self.options = GenericAvatarOptions(**kwargs) + + + + @property + + def required_sample_rate(self) -> int: + + return 0 + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = { + + "api_key": self.options.api_key, + + "api_base_url": self.options.api_base_url, + + "avatar_id": self.options.avatar_id, + + "agora_uid": self.options.agora_uid, + + } + + + + if self.options.agora_appid is not None: + + params["agora_appid"] = self.options.agora_appid + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + + if self.options.agora_channel is not None: + + params["agora_channel"] = self.options.agora_channel + + if self.options.additional_params is not None: + + params = {**self.options.additional_params, **params} + + + + enable = self.options.enable if self.options.enable is not None else True + + return {"enable": enable, "vendor": "generic", "params": params} + + + + + class AnamAvatarOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/avatar_types.py: | + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. + """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. 
" + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. 
+ """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + AKOOL_SAMPLE_RATE = 16000 + + + class HeyGenAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="HeyGen API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, 
description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific 
parameters") + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + pass + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = 
Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + status: unresolved + - id: patch-9f491c63 + content_hash: sha256:d9811b2c5927be74f2125444dcf36642b88ad7be422019688cb0228093dce1d0 + original_commit: 9f491c63a964c13c67ba4af3708379e1b75a92d8 + original_message: "feat(agentkit): update Agent builder and session lifecycle for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + patch_content: |+ + From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 21:00:58 -0400 + Subject: [PATCH] feat(agentkit): update Agent builder and session lifecycle + for v2.7 + + Aligns Agent and AgentSession with the generated v2.7 request shape. + MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars + are rejected when MLLM is configured. AgentSession now enriches generic + and RTC avatars with session context, auto-generates avatar tokens, + validates TTS sample rates from vendor-specific fields, and adds + paginated get_turns/get_all_turns helpers with fail-fast pagination + guards. 
+ --- + src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- + src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- + 2 files changed, 360 insertions(+), 35 deletions(-) + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 70a1bdd..86a958e 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -8,6 +8,24 @@ if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + +from ..agents.types.get_agents_response import GetAgentsResponse + +from ..agents.types.list_agents_response import ListAgentsResponse + +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + +from 
..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger import + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + +LlmConfig = StartAgentsRequestPropertiesLlm + +LlmStyle = StartAgentsRequestPropertiesLlmStyle + +SttConfig = StartAgentsRequestPropertiesAsr + +SttVendor = StartAgentsRequestPropertiesAsrVendor + +TtsConfig = Tts + +MllmConfig = StartAgentsRequestPropertiesMllm + +MllmVendor = 
StartAgentsRequestPropertiesMllmVendor + +AvatarConfig = StartAgentsRequestPropertiesAvatar + +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + @@ -93,6 +122,18 @@ InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + +AgentConfig = StartAgentsRequestProperties + +AgentConfigUpdate = UpdateAgentsRequestProperties + +SessionInfo = GetAgentsResponse + +SessionListResponse = ListAgentsResponse + +SessionSummary = ListAgentsResponseDataListItem + +ConversationHistory = GetHistoryAgentsResponse + +ConversationTurn = GetHistoryAgentsResponseContentsItem + +ConversationRole = GetHistoryAgentsResponseContentsItemRole + +ConversationTurns = GetTurnsAgentsResponse + +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + +SpeakPriority = SpeakAgentsRequestPriority + +Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + @@ -116,6 +157,7 @@ FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + @@ -183,9 +225,20 @@ class Agent: + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + + sample_rate = vendor.sample_rate + + if ( + + self._avatar_required_sample_rate not in (None, 0) + + and sample_rate is not None + 
+ and sample_rate != self._avatar_required_sample_rate + + ): + + raise ValueError( + + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + + f"but TTS is configured with {sample_rate} Hz. " + + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + - new_agent._tts_sample_rate = vendor.sample_rate + + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + @@ -194,6 +247,9 @@ class Agent: + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + + # Note: avatars are not supported with MLLM. The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` so callers can still + + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + @@ -202,7 +258,10 @@ class Agent: + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + + advanced_features_model = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_mllm": None}, + + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + @@ -214,6 +273,10 @@ class Agent: + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + + # Note: avatars are not supported with MLLM. 
The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + + # enabled) so callers may still combine the two for testing or for the + + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + @@ -282,7 +345,10 @@ class Agent: + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + + new_agent._advanced_features = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_tools": enabled}, + + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + @@ -294,6 +360,23 @@ class Agent: + new_agent._parameters = parameters + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + + new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. 
+ + @@ -342,6 +425,33 @@ class Agent: + new_agent._filler_words = filler_words + return new_agent + + + @staticmethod + + def _field_value(value: typing.Any, field: str) -> typing.Any: + + if value is None: + + return None + + if isinstance(value, dict): + + return value.get(field) + + return getattr(value, field, None) + + + + @staticmethod + + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + + if hasattr(value, "model_copy"): + + return value.model_copy(update=update) + + if hasattr(value, "copy"): + + return value.copy(update=update) + + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + + data_channel = self._field_value(self._parameters, "data_channel") + + if not enable_rtm or data_channel is not None: + + return self._parameters + + if self._parameters is None: + + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + + if isinstance(self._parameters, dict): + + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + + @property + def name(self) -> typing.Optional[str]: + return self._name + @@ -354,6 +464,10 @@ class Agent: + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + + @property + + def tts_sample_rate(self) -> typing.Optional[int]: + + return self._tts_sample_rate + + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + @@ -536,6 +650,20 @@ class Agent: + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + + # Validate the MLLM + enabled-avatar combination BEFORE generating the + + # RTC token so callers 
get a clear, actionable error first (matches the + + # TypeScript and Go SDKs' fail-fast contract). + + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + + avatar_enabled = ( + + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + + ) + + if is_mllm_mode and avatar_enabled: + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + @@ -553,9 +681,6 @@ class Agent: + **token_kwargs, + ) + + - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + - is_mllm_mode = bool(mllm_flag or self._mllm is not None) + - + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + @@ -579,11 +704,12 @@ class Agent: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + - if self._parameters is not None: + - if isinstance(self._parameters, dict): + - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + + parameters = self._resolved_parameters() + + if parameters is not None: + + if isinstance(parameters, dict): + + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + - base_kwargs["parameters"] = self._parameters + + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + @@ -596,12 +722,10 @@ class Agent: + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + - if self._greeting: + + if self._greeting is not None: + 
mllm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + - if self._max_history is not None: + - mllm_config.setdefault("max_history", self._max_history) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + @@ -617,14 +741,14 @@ class Agent: + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + - if self._instructions: + + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + - if self._greeting: + - llm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + - llm_config.setdefault("failure_message", self._failure_message) + + if self._greeting is not None: + + llm_config["greeting_message"] = self._greeting + + if self._failure_message is not None: + + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + - llm_config.setdefault("max_history", self._max_history) + + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index 2408659..e41a399 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_request_on_thinking_a + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent 
import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -182,17 +185,29 @@ class _AgentSessionBase: + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + + if self._is_mllm_mode(): + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + + sample_rate = self._agent.tts_sample_rate + + if sample_rate is None and isinstance(tts_params, dict): + + sample_rate = ( + + tts_params.get("sample_rate") + + or tts_params.get("sample_rate_hertz") + + or tts_params.get("samplingRate") + + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + @@ -211,6 +226,50 @@ class _AgentSessionBase: + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + + avatar = properties.get("avatar") + + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + + return + + + + params = avatar.get("params") + + if not isinstance(params, dict): + + params = {} + + avatar["params"] = params + + + + if is_generic_avatar(avatar): + + if not params.get("agora_appid"): + + params["agora_appid"] = self._app_id + + if not params.get("agora_channel"): + + params["agora_channel"] = self._channel + + + + if not is_rtc_avatar(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_token"): + + if not self._app_certificate: + + raise ValueError( + + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + + ) + + token_kwargs: typing.Dict[str, typing.Any] = {} + + if self._expires_in is not None: + + token_kwargs["token_expire"] = self._expires_in + + params["agora_token"] = generate_convo_ai_token( + + app_id=self._app_id, + + app_certificate=self._app_certificate, + + channel_name=self._channel, + + account=str(params["agora_uid"]), + + **token_kwargs, + + ) + + + + if str(params.get("agora_uid")) == self._agent_uid: + + self._warn( + + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ + ) + + + + validate_avatar_config(avatar, require_session_fields=True) + + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + @@ -238,12 +297,17 @@ class _AgentSessionBase: + **token_opts, + ) + properties = self._dump_model(base_properties) + + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + - mllm = dict(self._agent.mllm) + - if self._agent.greeting: + + mllm = self._dump_model(self._agent.mllm) + + if not isinstance(mllm, dict): + + mllm = {} + + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + + if self._agent.failure_message is not None: + + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + @@ -251,20 +315,41 @@ class _AgentSessionBase: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + - if self._agent.instructions: + + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + - if self._agent.greeting: + - llm.setdefault("greeting_message", self._agent.greeting) + - if self._agent.failure_message: + - llm.setdefault("failure_message", self._agent.failure_message) + + if self._agent.greeting is not None: + + llm["greeting_message"] = self._agent.greeting + + if self._agent.failure_message is not None: + + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + - llm.setdefault("max_history", self._agent.max_history) + + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + + @staticmethod + + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + + if pagination is None: + + return None + + if 
isinstance(pagination, dict): + + return pagination.get(field) + + return getattr(pagination, field, None) + + + + @staticmethod + + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + + return list(turns or []) + + + + @classmethod + + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + + data = cls._dump_model(first_response) + + if not isinstance(data, dict): + + data = {} + + data["turns"] = turns + + return GetTurnsAgentsResponse(**data) + + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + @@ -484,7 +569,12 @@ class AgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -547,15 +637,68 @@ class AgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - def get_turns(self) -> typing.Any: + + def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." + + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + @@ -734,7 +877,12 @@ class AsyncAgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -797,11 +945,64 @@ class AsyncAgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - async def get_turns(self) -> typing.Any: + + async def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = await self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = await self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from 
..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from 
..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, 
BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and 
.end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + 
silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and 
sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
def with_tools(self, enabled: bool = True) -> "Agent":
    """Returns a new Agent with MCP tool invocation enabled or disabled.

    The ``enable_tools`` flag is merged into whatever advanced-features
    value the clone already holds:

    * ``None`` -> a fresh ``StartAgentsRequestPropertiesAdvancedFeatures``;
    * a ``dict`` -> a new dict with ``enable_tools`` overridden;
    * anything else -> a model copy via ``_copy_model_update``.

    Parameters
    ----------
    enabled : bool
        Value written to ``enable_tools`` (defaults to ``True``).
    """
    clone = self._clone()
    current = clone._advanced_features
    patch = {"enable_tools": enabled}

    if current is None:
        merged = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled)
    elif isinstance(current, dict):
        merged = typing.cast(AdvancedFeatures, {**current, **patch})
    else:
        merged = self._copy_model_update(current, patch)

    clone._advanced_features = merged
    return clone
def with_labels(self, labels: typing.Dict[str, str]) -> "Agent":
    """Return a copy of this agent with the given custom labels attached.

    Labels are key-value pairs attached to the agent and returned in
    notification callbacks.

    Parameters
    ----------
    labels : dict of str to str
        Labels to attach. They are copied with ``dict(labels)``, so later
        mutation of the caller's mapping does not affect the returned agent.
    """
    labelled = self._clone()
    # Store a shallow copy rather than the caller's own mapping.
    labelled._labels = dict(labels)
    return labelled
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
@property
def config(self) -> typing.Dict[str, typing.Any]:
    """Snapshot of the agent's settings as a plain dictionary.

    Each key maps to the private attribute of the same name with a
    leading underscore (``"llm"`` -> ``self._llm``). Values are returned
    as stored; nothing is copied or serialised.
    """
    exported = (
        "name",
        "instructions",
        "greeting",
        "failure_message",
        "max_history",
        "llm",
        "tts",
        "stt",
        "mllm",
        "turn_detection",
        "interruption",
        "sal",
        "avatar",
        "advanced_features",
        "parameters",
        "geofence",
        "labels",
        "rtc",
        "filler_words",
    )
    return {key: getattr(self, f"_{key}") for key in exported}
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
def to_properties(
    self,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    token: typing.Optional[str] = None,
    app_id: typing.Optional[str] = None,
    app_certificate: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    skip_vendor_validation: bool = False,
) -> StartAgentsRequestProperties:
    """Assemble the ``StartAgentsRequestProperties`` for this agent.

    Parameters
    ----------
    channel, agent_uid, remote_uids
        Stored as ``channel``, ``agent_rtc_uid`` and ``remote_rtc_uids``.
    idle_timeout, enable_string_uid
        Included in the result only when not ``None``.
    token
        Pre-built token. When ``None`` a token is generated with
        ``generate_convo_ai_token`` from ``app_id`` / ``app_certificate``.
    app_id, app_certificate
        Required when ``token`` is ``None``.
    expires_in
        When given, validated with ``_validate_expires_in`` and passed as
        ``token_expire`` to the token generator.
    skip_vendor_validation
        When true (and the agent is not in MLLM mode) the result is
        returned without the ``llm`` / ``tts`` / ``asr`` entries and
        without checking that TTS and LLM are configured.

    Raises
    ------
    ValueError
        If an MLLM configuration is combined with an enabled avatar, if
        neither ``token`` nor both credentials are supplied, or if TTS or
        LLM is missing in cascading mode.
    """
    # Reject MLLM + enabled avatar BEFORE generating the token so callers
    # see the actionable error first (matches the TypeScript and Go SDKs'
    # fail-fast contract).
    in_mllm_mode = self._mllm is not None
    avatar = self._avatar
    avatar_enabled = isinstance(avatar, dict) and avatar.get("enable") is not False
    if in_mllm_mode and avatar_enabled:
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    if token is None:
        if app_id is None or app_certificate is None:
            raise ValueError("Either token or app_id+app_certificate must be provided")
        token_kwargs: typing.Dict[str, typing.Any] = {}
        if expires_in is not None:
            validated = _validate_expires_in(expires_in)
            if validated is not None:
                token_kwargs["token_expire"] = validated
        # generate_convo_ai_token (RTC + RTM) is used so the token works
        # whether or not the caller enables advanced_features.enable_rtm.
        token = generate_convo_ai_token(
            app_id=app_id,
            app_certificate=app_certificate,
            channel_name=channel,
            account=agent_uid,
            **token_kwargs,
        )

    properties: typing.Dict[str, typing.Any] = {
        "channel": channel,
        "token": token,
        "agent_rtc_uid": agent_uid,
        "remote_rtc_uids": remote_uids,
    }

    # Optional entries are copied only when set; "parameters" sits between
    # the two groups because dict input must first be turned into a model.
    before_parameters = (
        ("idle_timeout", idle_timeout),
        ("enable_string_uid", enable_string_uid),
        ("mllm", self._mllm),
        ("turn_detection", self._turn_detection),
        ("interruption", self._interruption),
        ("sal", self._sal),
        ("avatar", self._avatar),
        ("advanced_features", self._advanced_features),
    )
    for key, value in before_parameters:
        if value is not None:
            properties[key] = value

    parameters = self._resolved_parameters()
    if isinstance(parameters, dict):
        properties["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
    elif parameters is not None:
        properties["parameters"] = parameters

    after_parameters = (
        ("geofence", self._geofence),
        ("labels", self._labels),
        ("rtc", self._rtc),
        ("filler_words", self._filler_words),
    )
    for key, value in after_parameters:
        if value is not None:
            properties[key] = value

    if in_mllm_mode:
        # Agent-level greeting / failure message only fill gaps here:
        # values already present on the MLLM config win (setdefault).
        mllm_config = dict(self._mllm)
        mllm_fallbacks = (
            ("greeting_message", self._greeting),
            ("failure_message", self._failure_message),
        )
        for key, fallback in mllm_fallbacks:
            if fallback is not None:
                mllm_config.setdefault(key, fallback)
        properties["mllm"] = mllm_config
        return StartAgentsRequestProperties(**properties)

    if skip_vendor_validation:
        return StartAgentsRequestProperties(**properties)

    if self._tts is None:
        raise ValueError("TTS configuration is required. Use with_tts() to set it.")

    if self._llm is None:
        raise ValueError("LLM configuration is required. Use with_llm() to set it.")

    # In cascading mode agent-level fields take priority over the vendor's
    # values (plain assignment, not setdefault), matching the TS SDK.
    llm_config = dict(self._llm)
    if self._instructions is not None:
        llm_config["system_messages"] = [{"role": "system", "content": self._instructions}]
    llm_overrides = (
        ("greeting_message", self._greeting),
        ("failure_message", self._failure_message),
        ("max_history", self._max_history),
    )
    for key, override in llm_overrides:
        if override is not None:
            llm_config[key] = override

    properties["llm"] = llm_config
    properties["tts"] = self._tts
    if self._stt is not None:
        properties["asr"] = self._stt

    return StartAgentsRequestProperties(**properties)
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    # ``total=False``: every key declared below may be omitted.
    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # Either a single preset name or a sequence of names.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callback that receives a warning message string.
    warn: typing.Callable[[str], None]
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history 
is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
def stop(self) -> None:
    """Stop the agent session.

    If the agent has already stopped (e.g., crashed or timed out), the
    server returns 404, which this method treats as a successful stop
    rather than raising an error.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    Exception
        Any other failure from the stop call is re-raised after the
        status is set to ``error`` and an ``error`` event is emitted.
    """
    status = self._status
    if status != "running":
        raise RuntimeError(f"Cannot stop session in {status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    self._status = "stopping"

    try:
        self._client.agents.stop(
            self._app_id, self._agent_id, request_options=self._request_options()
        )
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})
    except Exception as exc:
        # A 404 ApiError means the agent is already gone: report a stop.
        if isinstance(exc, ApiError) and exc.status_code == 404:
            self._status = "stopped"
            self._emit("stopped", {"agent_id": self._agent_id})
            return
        self._status = "error"
        self._emit("error", exc)
        raise
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + status: unresolved + - id: patch-eaec58eb + content_hash: sha256:8390ced175326080fc76021a97d315e71229bbc9ad70eef35a63eb9968df7830 + original_commit: eaec58eb2edfe03b1311a32dd137a867edf5d096 + original_message: "refactor(agentkit): align deprecated vendor aliases with canonical names" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/__init__.py + - src/agora_agent/agentkit/vendors/avatar.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 689eab1..8e2042e 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -13,7 +13,7 @@ from .base import ( + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + @@ -83,7 +83,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index 00cad8f..50bdd08 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -5,19 +5,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + -HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + -class HeyGenAvatarOptions(BaseModel): + +class 
LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + - api_key: str = Field(..., description="HeyGen API key") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + @@ -31,20 +31,14 @@ class HeyGenAvatarOptions(BaseModel): + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + -class HeyGenAvatar(BaseAvatar): + - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - warnings.warn( + - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - self.options = HeyGenAvatarOptions(**kwargs) + + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return HEYGEN_SAMPLE_RATE + + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + @@ -65,71 +59,79 @@ class HeyGenAvatar(BaseAvatar): + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "heygen", "params": params} + + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + -class AkoolAvatarOptions(BaseModel): + - model_config = ConfigDict(extra="forbid") + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + - api_key: str = Field(..., description="Akool API key") + - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + -class AkoolAvatar(BaseAvatar): + +class HeyGenAvatar(BaseAvatar): + + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + + def __init__(self, **kwargs: Any): + - self.options = AkoolAvatarOptions(**kwargs) + + warnings.warn( + + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return AKOOL_SAMPLE_RATE + + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + + "quality": self.options.quality, + + "agora_uid": self.options.agora_uid, + } + + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + + if self.options.disable_idle_timeout is not None: + + params["disable_idle_timeout"] = self.options.disable_idle_timeout + + if self.options.activity_idle_timeout is not None: + + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "akool", "params": params} + + return {"enable": enable, "vendor": "heygen", "params": params} + + + -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + - pass + +class AkoolAvatarOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Akool API key") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + -class LiveAvatarAvatar(BaseAvatar): + +class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - 
self.options = LiveAvatarAvatarOptions(**kwargs) + + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return LIVEAVATAR_SAMPLE_RATE + + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + - "quality": self.options.quality, + - "agora_uid": self.options.agora_uid, + } + + - if self.options.agora_token is not None: + - params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + - if self.options.disable_idle_timeout is not None: + - params["disable_idle_timeout"] = self.options.disable_idle_timeout + - if self.options.activity_idle_timeout is not None: + - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "liveavatar", "params": params} + + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + - persona_id: Optional[str] = Field(default=None, description="Persona ID") + + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index cd6cd07..b58f040 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,4 +1,3 @@ + -import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -119,30 +118,6 @@ class XaiGrok(BaseMLLM): + return config + + + -class XaiRealtimeOptions(XaiGrokOptions): + - """Deprecated: use :class:`XaiGrokOptions` instead.""" + - + - def __init__(self, **data: Any): + - warnings.warn( + - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**data) + - + - + -class XaiRealtime(XaiGrok): + - """Deprecated: use :class:`XaiGrok` instead.""" + - + - def __init__(self, **kwargs: Any): + - warnings.warn( + - "XaiRealtime is deprecated; use XaiGrok instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**kwargs) + - + - + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + theirs_snapshot: + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + 
MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + LIVEAVATAR_SAMPLE_RATE = 24000 + HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + class LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] 
= Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = 
self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = 
{**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + src/agora_agent/agentkit/vendors/mllm.py: | + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: 
Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). 
Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class 
VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + 
- id: patch-20245632 + content_hash: sha256:a22e4a3b114ba8105c8129ccd6222570dc1f231daf9ac6037a00bcd4e11c425b + original_commit: 20245632afd066efe5a453665b29c5ba0e13e4f8 + original_message: "feat(agentkit): export type aliases and avatar token helpers" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/constants.py + patch_content: |+ + From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 15:17:27 -0400 + Subject: [PATCH] feat(agentkit): export type aliases and avatar token helpers + + --- + src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- + src/agora_agent/agentkit/agent.py | 22 +++++++++- + src/agora_agent/agentkit/agent_session.py | 8 +++- + src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- + src/agora_agent/agentkit/constants.py | 10 +++++ + 5 files changed, 90 insertions(+), 22 deletions(-) + + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 5ceda66..e9ab221 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -2,6 +2,7 @@ from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + @@ -62,23 +63,23 @@ from .agent import ( + SessionListResponse, + SessionSummary, + SpeakPriority, + + ThinkOnListeningAction, + + ThinkOnSpeakingAction, + + ThinkOnThinkingAction, + + ThinkResponse, + ) + -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + -from ..agent_management.types.agent_think_agent_management_response import ( + - AgentThinkAgentManagementResponse as AgentThinkResponse, + -) + -from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + +# Deprecated think type aliases (prefer ThinkOn* names). + +from .agent import ( + + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + + ThinkResponse as AgentThinkResponse, + ) + +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + @@ -94,6 +95,13 @@ from .constants import ( + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + + ThinkOnListeningActionIgnore, + + ThinkOnListeningActionInject, + + ThinkOnListeningActionInterrupt, + + ThinkOnSpeakingActionIgnore, + + ThinkOnSpeakingActionInterrupt, + + ThinkOnThinkingActionIgnore, + + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + @@ -158,7 +166,6 @@ from .vendors import ( + SpeechmaticsSTT, + VertexAI, + XaiGrok, + - XaiRealtime, + LiveAvatarAvatar, + ) + + @@ -172,6 +179,7 @@ __all__ = [ + "LlmConfig", + "LlmStyle", + "SttConfig", + + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + @@ -230,6 +238,13 @@ __all__ = [ + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + + "ThinkOnListeningActionInject", + + 
"ThinkOnListeningActionInterrupt", + + "ThinkOnListeningActionIgnore", + + "ThinkOnThinkingActionInterrupt", + + "ThinkOnThinkingActionIgnore", + + "ThinkOnSpeakingActionInterrupt", + + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + @@ -246,10 +261,16 @@ __all__ = [ + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + + "ThinkResponse", + + "ThinkOnListeningAction", + + "ThinkOnThinkingAction", + + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + + "is_avatar_token_managed", + + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + @@ -303,7 +324,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + @@ -314,7 +334,6 @@ __all__ = [ + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + - "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 86a958e..14933a2 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + -from .token import generate_convo_ai_token, _validate_expires_in + +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + + AgentThinkAgentManagementRequestOnListeningAction, + +) + 
+from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + + AgentThinkAgentManagementRequestOnThinkingAction, + +) + +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + + AgentThinkAgentManagementRequestOnSpeakingAction, + +) + +from ..agent_management.types.agent_think_agent_management_response import ( + + AgentThinkAgentManagementResponse, + +) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + +AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + @@ -159,6 +171,14 @@ FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + +# Think type aliases and response + +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + +ThinkResponse = AgentThinkAgentManagementResponse + + + +from .token import generate_convo_ai_token, _validate_expires_in + + + + class Agent: + """A reusable agent definition. 
+ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e41a399..269619e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -20,10 +20,10 @@ from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + - is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -242,7 +242,11 @@ class _AgentSessionBase: + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + - if not is_rtc_avatar(avatar): + + if not is_avatar_token_managed(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index a04809c..aea9da1 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -1,3 +1,4 @@ + +import warnings + import typing + + + @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + + """Return True when AgentKit manages the avatar RTC publisher identity.""" + + return ( + + is_heygen_avatar(config) + + or is_live_avatar_avatar(config) + + or is_generic_avatar(config) + + ) + + + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + - params = config.get("params", {}) + - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + """Deprecated: use :func:`is_avatar_token_managed` for 
vendor gating.""" + + warnings.warn( + + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + + "and keep agora_uid checks in session enrichment.", + + DeprecationWarning, + + stacklevel=2, + ) + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + @@ -95,7 +110,7 @@ def validate_tts_sample_rate( + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - - HeyGen: ONLY supports 24,000 Hz + + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py + index f86e4d3..c0a852e 100644 + --- a/src/agora_agent/agentkit/constants.py + +++ b/src/agora_agent/agentkit/constants.py + @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + + + +# Think action value constants (match Fern wire values) + +ThinkOnListeningActionInject = "inject" + +ThinkOnListeningActionInterrupt = "interrupt" + +ThinkOnListeningActionIgnore = "ignore" + +ThinkOnThinkingActionInterrupt = "interrupt" + +ThinkOnThinkingActionIgnore = "ignore" + +ThinkOnSpeakingActionInterrupt = "interrupt" + +ThinkOnSpeakingActionIgnore = "ignore" + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + 
FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, + ) + # Deprecated think type aliases (prefer ThinkOn* names). 
+ from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + 
VertexAI, + XaiGrok, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + 
"AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, 
AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from 
..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import 
StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from 
..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from 
..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + 
ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _validate_expires_in + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. 
+ + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features 
+ self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
def to_properties(
    self,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    token: typing.Optional[str] = None,
    app_id: typing.Optional[str] = None,
    app_certificate: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    skip_vendor_validation: bool = False,
) -> StartAgentsRequestProperties:
    """Assemble the ``StartAgentsRequestProperties`` payload for this agent.

    Parameters
    ----------
    channel, agent_uid, remote_uids
        Emitted as ``channel``, ``agent_rtc_uid`` and ``remote_rtc_uids``.
    idle_timeout, enable_string_uid
        Added to the payload only when they are not ``None``.
    token
        Ready-made token. When ``None`` a token is produced with
        ``generate_convo_ai_token`` from ``app_id`` / ``app_certificate``.
    app_id, app_certificate
        Both are required whenever ``token`` is ``None``.
    expires_in
        Optional lifetime; passed through ``_validate_expires_in`` and then
        forwarded to the token generator as ``token_expire``.
    skip_vendor_validation
        When true, a cascading (non-MLLM) agent is returned before the
        TTS/LLM presence checks, without ``llm``/``tts``/``asr`` entries.

    Raises
    ------
    ValueError
        If an MLLM config is combined with an enabled avatar, if neither a
        token nor both credentials are supplied, or (cascading mode without
        ``skip_vendor_validation``) if the TTS or LLM config is missing.
    """
    # Fail fast on MLLM + enabled avatar before any token is generated so the
    # caller sees the actionable error first (the same fail-fast contract as
    # the TypeScript and Go SDKs). Any attached MLLM config counts as MLLM
    # mode, whatever its ``enable`` flag says.
    uses_mllm = self._mllm is not None
    avatar_cfg = self._avatar
    avatar_active = isinstance(avatar_cfg, dict) and avatar_cfg.get("enable") is not False
    if uses_mllm and avatar_active:
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    if token is None:
        if app_id is None or app_certificate is None:
            raise ValueError("Either token or app_id+app_certificate must be provided")
        expiry_kwargs: typing.Dict[str, typing.Any] = {}
        if expires_in is not None:
            checked_expiry = _validate_expires_in(expires_in)
            if checked_expiry is not None:
                expiry_kwargs["token_expire"] = checked_expiry
        # generate_convo_ai_token covers RTC + RTM, so the token works whether
        # or not the caller enables advanced_features.enable_rtm.
        token = generate_convo_ai_token(
            app_id=app_id,
            app_certificate=app_certificate,
            channel_name=channel,
            account=agent_uid,
            **expiry_kwargs,
        )

    payload: typing.Dict[str, typing.Any] = {
        "channel": channel,
        "token": token,
        "agent_rtc_uid": agent_uid,
        "remote_rtc_uids": remote_uids,
    }

    def copy_present(pairs: typing.Iterable[typing.Tuple[str, typing.Any]]) -> None:
        # Copy every entry whose value is set; ``None`` means "leave it out".
        for key, value in pairs:
            if value is not None:
                payload[key] = value

    copy_present(
        (
            ("idle_timeout", idle_timeout),
            ("enable_string_uid", enable_string_uid),
            ("mllm", self._mllm),
            ("turn_detection", self._turn_detection),
            ("interruption", self._interruption),
            ("sal", self._sal),
            ("avatar", self._avatar),
            ("advanced_features", self._advanced_features),
        )
    )

    parameters = self._resolved_parameters()
    if isinstance(parameters, dict):
        payload["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
    elif parameters is not None:
        payload["parameters"] = parameters

    copy_present(
        (
            ("geofence", self._geofence),
            ("labels", self._labels),
            ("rtc", self._rtc),
            ("filler_words", self._filler_words),
        )
    )

    if uses_mllm:
        # Agent-level greeting / failure message only fill gaps here: values
        # already present on the MLLM config are kept (setdefault).
        mllm_cfg = dict(self._mllm)
        if self._greeting is not None:
            mllm_cfg.setdefault("greeting_message", self._greeting)
        if self._failure_message is not None:
            mllm_cfg.setdefault("failure_message", self._failure_message)
        payload["mllm"] = mllm_cfg
        return StartAgentsRequestProperties(**payload)

    if skip_vendor_validation:
        return StartAgentsRequestProperties(**payload)

    if self._tts is None:
        raise ValueError("TTS configuration is required. Use with_tts() to set it.")

    if self._llm is None:
        raise ValueError("LLM configuration is required. Use with_llm() to set it.")

    # In cascading mode the agent-level fields override the vendor's values
    # (matching the TS SDK, where agent-level values win).
    llm_cfg = dict(self._llm)
    system_messages = (
        None
        if self._instructions is None
        else [{"role": "system", "content": self._instructions}]
    )
    for key, value in (
        ("system_messages", system_messages),
        ("greeting_message", self._greeting),
        ("failure_message", self._failure_message),
        ("max_history", self._max_history),
    ):
        if value is not None:
            llm_cfg[key] = value

    payload["llm"] = llm_cfg
    payload["tts"] = self._tts
    if self._stt is not None:
        payload["asr"] = self._stt

    return StartAgentsRequestProperties(**payload)
class _AgentSessionRequiredOptions(typing.TypedDict, total=True):
    """Required fields shared by both sync and async session constructors.

    Declared with ``total=True`` so every key below must be present;
    :class:`AgentSessionOptions` extends this with the optional keys.
    """

    client: typing.Any
    agent: Agent
    app_id: str
    name: str
    channel: str
    agent_uid: str
    remote_uids: typing.List[str]


class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    # ``total=False`` applies only to the keys declared in this class; the
    # inherited keys stay required.
    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # Either a single string or a sequence of strings.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callable that receives one string argument.
    warn: typing.Callable[[str], None]
@property
def id(self) -> typing.Optional[str]:
    """Agent ID stored on this session (``None`` while none is stored)."""
    return self._agent_id

@property
def status(self) -> str:
    """Current value of the session's status string."""
    return self._status

@property
def agent(self) -> Agent:
    """The ``Agent`` definition this session was created with."""
    return self._agent

@property
def app_id(self) -> str:
    """The app ID this session was created with."""
    return self._app_id

@property
def raw(self) -> typing.Any:
    """Return ``self._client.agents``, the underlying Fern-generated AgentsClient.

    Lets callers reach newly generated endpoints without waiting for a
    matching agentkit wrapper method.
    """
    return self._client.agents

@property
def raw_agent_management(self) -> typing.Any:
    """Return ``self._client.agent_management``, the Fern-generated AgentManagement client."""
    return self._client.agent_management

# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------

def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]:
    """Build the per-request ``Authorization`` header for app-credentials mode.

    Returns ``None`` unless the client's ``auth_mode`` attribute equals
    ``"app-credentials"`` (the client-level header is used in that case).
    Otherwise a fresh ConvoAI token (RTC + RTM) is generated on every call
    for this session's channel and agent UID. The app ID and certificate
    are read from the client, falling back to the session's own values.

    Raises
    ------
    RuntimeError
        If no app certificate is available in app-credentials mode.
    """
    client = self._client
    if getattr(client, "auth_mode", None) != "app-credentials":
        return None

    resolved_app_id: str = getattr(client, "app_id", self._app_id)
    certificate: typing.Optional[str] = getattr(
        client, "app_certificate", self._app_certificate
    )
    if not certificate:
        raise RuntimeError("app_certificate is required for app-credentials auth mode")

    token = generate_convo_ai_token(
        app_id=resolved_app_id,
        app_certificate=certificate,
        channel_name=self._channel,
        account=self._agent_uid,
    )
    return {"Authorization": f"agora token={token}"}

def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]:
    """Wrap the per-request auth headers as ``request_options``, or return ``None``."""
    auth_headers = self._convo_ai_headers()
    return None if auth_headers is None else {"additional_headers": auth_headers}
def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None:
    """Fill session-derived avatar params in ``properties`` in place, then validate.

    Does nothing when ``properties["avatar"]`` is not a dict or has
    ``enable`` set to ``False``. Otherwise:

    * a non-dict ``params`` entry is replaced by a fresh dict;
    * for generic avatars, falsy ``agora_appid`` / ``agora_channel`` params
      are filled from the session's app ID and channel;
    * for token-managed avatars that carry an ``agora_uid``, a falsy
      ``agora_token`` is replaced by a newly generated token.

    Raises
    ------
    ValueError
        If a token has to be generated but the session has no app
        certificate. ``validate_avatar_config`` is also called on every
        non-early-return path.
    """
    avatar = properties.get("avatar")
    if not isinstance(avatar, dict) or avatar.get("enable", True) is False:
        return

    params = avatar.get("params")
    if not isinstance(params, dict):
        params = {}
        avatar["params"] = params

    if is_generic_avatar(avatar):
        for key, fallback in (
            ("agora_appid", self._app_id),
            ("agora_channel", self._channel),
        ):
            if not params.get(key):
                params[key] = fallback

    # Nothing to generate without token management or without an avatar UID:
    # validate what is there and stop.
    if not is_avatar_token_managed(avatar) or not params.get("agora_uid"):
        validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar))
        return

    if not params.get("agora_token"):
        certificate = self._app_certificate
        if not certificate:
            raise ValueError(
                "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. "
                "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor."
            )
        expiry_kwargs: typing.Dict[str, typing.Any] = {}
        if self._expires_in is not None:
            expiry_kwargs["token_expire"] = self._expires_in
        params["agora_token"] = generate_convo_ai_token(
            app_id=self._app_id,
            app_certificate=certificate,
            channel_name=self._channel,
            account=str(params["agora_uid"]),
            **expiry_kwargs,
        )

    if str(params.get("agora_uid")) == self._agent_uid:
        self._warn(
            "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher."
        )

    validate_avatar_config(avatar, require_session_fields=True)

@staticmethod
def _dump_model(value: typing.Any) -> typing.Any:
    """Recursively convert ``value`` into plain data.

    Objects exposing ``model_dump`` are dumped with ``exclude_none=True``;
    dicts are rebuilt without their ``None`` values (remaining values are
    converted recursively); lists are converted element by element (``None``
    items are kept); anything else is returned unchanged.
    """

    def convert(node: typing.Any) -> typing.Any:
        if hasattr(node, "model_dump"):
            return node.model_dump(exclude_none=True)
        if isinstance(node, dict):
            return {key: convert(item) for key, item in node.items() if item is not None}
        if isinstance(node, list):
            return [convert(element) for element in node]
        return node

    return convert(value)

def _is_mllm_mode(self) -> bool:
    """Return whether the agent has any MLLM config attached.

    Any non-``None`` ``agent.mllm`` counts, regardless of its ``enable`` flag.
    """
    return self._agent.mllm is not None
@staticmethod
def _page_value(pagination: typing.Any, field: str) -> typing.Any:
    """Read ``field`` from pagination metadata given as a dict or an object.

    Returns ``None`` when ``pagination`` is ``None`` or the field is absent.
    """
    if pagination is None:
        return None
    if isinstance(pagination, dict):
        return pagination.get(field)
    return getattr(pagination, field, None)

@staticmethod
def _response_turns(response: typing.Any) -> typing.List[typing.Any]:
    """Return the response's ``turns`` as a new list (empty when missing or falsy)."""
    if isinstance(response, dict):
        turns = response.get("turns")
    else:
        turns = getattr(response, "turns", None)
    return list(turns or [])

@classmethod
def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse:
    """Rebuild a ``GetTurnsAgentsResponse`` from ``first_response`` with ``turns`` swapped in.

    The response is flattened with ``_dump_model``; if that does not yield a
    dict, only ``turns`` is passed to the constructor.
    """
    dumped = cls._dump_model(first_response)
    payload = dumped if isinstance(dumped, dict) else {}
    payload["turns"] = turns
    return GetTurnsAgentsResponse(**payload)

# ------------------------------------------------------------------
# Event handling
# ------------------------------------------------------------------

def on(self, event: str, handler: typing.Callable[..., None]) -> None:
    """Register an event handler.

    Parameters
    ----------
    event : str
        The event type (``started``, ``stopped``, ``error``).
    handler : callable
        The event handler to invoke when the event fires.
    """
    self._event_handlers.setdefault(event, []).append(handler)

def off(self, event: str, handler: typing.Callable[..., None]) -> None:
    """Unregister a previously registered event handler.

    Unknown events and handlers that are not registered are ignored.
    """
    handlers = self._event_handlers.get(event)
    if handlers and handler in handlers:
        handlers.remove(handler)

def _emit(self, event: str, data: typing.Any) -> None:
    """Invoke every handler registered for ``event`` with ``data``.

    Dispatch runs over a snapshot of the handler list, so a handler may call
    :meth:`on` / :meth:`off` (for example to unregister itself after one
    call) without causing another handler to be skipped. Handlers added
    during dispatch first run on the next emit; handlers removed during
    dispatch may still run once in the current round.

    An exception raised by a handler is reported through ``warnings.warn``
    and does not stop the remaining handlers.
    """
    handlers = self._event_handlers.get(event)
    if not handlers:
        return
    # Iterate a copy: mutating a list while iterating it shifts the
    # remaining items and silently skips the element after a removal.
    for handler in tuple(handlers):
        try:
            handler(data)
        except Exception as exc:
            # Prevent a misbehaving handler from blocking other handlers or
            # the session lifecycle. Warn so the error is not silently lost.
            warnings.warn(
                f"Event handler for '{event}' raised an exception: {exc}",
                stacklevel=2,
            )
def stop(self) -> None:
    """Stop the agent session.

    A 404 ``ApiError`` from the server (the agent already stopped, e.g. it
    crashed or timed out) is treated as a successful stop. Any other
    failure sets the status to ``"error"``, emits an ``error`` event and is
    re-raised.

    Raises
    ------
    RuntimeError
        If the session is not ``"running"`` or has no agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot stop session in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    self._status = "stopping"

    def mark_stopped() -> None:
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})

    try:
        self._client.agents.stop(
            self._app_id, self._agent_id, request_options=self._request_options()
        )
        mark_stopped()
    except Exception as error:
        # "Not found" means the agent is already gone: report a clean stop.
        if isinstance(error, ApiError) and error.status_code == 404:
            mark_stopped()
            return
        self._status = "error"
        self._emit("error", error)
        raise
def interrupt(self) -> None:
    """Interrupt the agent while it is speaking or thinking.

    Raises ``RuntimeError`` unless the session is ``"running"`` and has an
    agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot interrupt in {self._status} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    self._client.agents.interrupt(
        self._app_id, agent_id, request_options=self._request_options()
    )

def think(
    self,
    text: str,
    *,
    on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None,
    on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None,
    on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None,
    interruptable: typing.Optional[bool] = None,
    metadata: typing.Optional[typing.Dict[str, str]] = None,
) -> AgentThinkResponse:
    """Inject a custom text instruction into the current session pipeline.

    Only the optional arguments that are not ``None`` are forwarded to
    ``agent_management.agent_think``.

    In API v2.7, omitting ``on_listening_action`` uses the server default
    ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to
    preserve the pre-v2.7 behavior.

    Raises ``RuntimeError`` unless the session is ``"running"`` and has an
    agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot think in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    optional_fields = {
        "on_listening_action": on_listening_action,
        "on_thinking_action": on_thinking_action,
        "on_speaking_action": on_speaking_action,
        "interruptable": interruptable,
        "metadata": metadata,
    }
    request_fields: typing.Dict[str, typing.Any] = {"text": text}
    request_fields.update(
        (key, value) for key, value in optional_fields.items() if value is not None
    )

    return self._client.agent_management.agent_think(
        self._app_id,
        self._agent_id,
        request_options=self._request_options(),
        **request_fields,
    )

def update(self, properties: typing.Any) -> None:
    """Update the agent configuration at runtime.

    Parameters
    ----------
    properties : UpdateAgentsRequestProperties
        Partial configuration to update.

    Raises ``RuntimeError`` unless the session is ``"running"`` and has an
    agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot update in {self._status} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    self._client.agents.update(
        self._app_id,
        agent_id,
        properties=properties,
        request_options=self._request_options(),
    )

def get_history(self) -> typing.Any:
    """Get the conversation history (requires an agent ID, not a running state)."""
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    return self._client.agents.get_history(
        self._app_id, agent_id, request_options=self._request_options()
    )

def get_info(self) -> typing.Any:
    """Get the current session info (requires an agent ID, not a running state)."""
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    return self._client.agents.get(
        self._app_id, agent_id, request_options=self._request_options()
    )
def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse:
    """Get all turn analytics pages for this session.

    Starts at page 1 and keeps requesting ``current_page + 1`` for as long
    as the latest pagination metadata reports ``is_last_page`` as exactly
    ``False``. The turns of every fetched page are concatenated in fetch
    order.

    Parameters
    ----------
    page_size : int, optional
        Forwarded unchanged to every ``get_turns`` call.

    Returns
    -------
    GetTurnsAgentsResponse
        Built by ``_with_all_turns`` from the most recently fetched page's
        response, with ``turns`` replaced by the accumulated list.

    Raises ``RuntimeError`` if the server's pagination metadata is missing
    the fields required to advance, or if requesting the next page returns
    a page index that did not advance.
    """
    response = self.get_turns(page_index=1, page_size=page_size)
    all_turns = self._response_turns(response)
    pagination = response.get("pagination") if isinstance(response, dict) else response.pagination
    # A missing (or falsy) page_index on the first page is treated as page 1.
    current_page = self._page_value(pagination, "page_index") or 1
    # Only an explicit ``is_last_page is False`` continues the loop; a missing
    # flag or missing pagination ends it.
    while pagination is not None and self._page_value(pagination, "is_last_page") is False:
        total_pages = self._page_value(pagination, "total_pages")
        returned_index = self._page_value(pagination, "page_index")
        # Without either field there is no way to tell where we are.
        if returned_index is None and total_pages is None:
            raise RuntimeError(
                "get_all_turns pagination cannot continue: response must include "
                "page_index, total_pages, or is_last_page=true."
            )
        # total_pages wins over is_last_page once the last known page is reached.
        if total_pages is not None and current_page >= total_pages:
            break
        next_page = current_page + 1
        response = self.get_turns(page_index=next_page, page_size=page_size)
        all_turns.extend(self._response_turns(response))
        pagination = response.get("pagination") if isinstance(response, dict) else response.pagination
        returned_index = self._page_value(pagination, "page_index") if pagination else None
        if returned_index is not None:
            # Guard against a server that keeps returning the same (or an
            # earlier) page while still claiming more pages exist.
            if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True:
                raise RuntimeError(
                    f"get_all_turns pagination did not advance: requested page {next_page}, "
                    f"received page {returned_index}."
                )
            current_page = returned_index
        else:
            # No page_index came back: trust the page we asked for, but only
            # if total_pages or is_last_page=true can still terminate the loop.
            total_pages = self._page_value(pagination, "total_pages") if pagination else None
            is_last_page = self._page_value(pagination, "is_last_page") if pagination else None
            if total_pages is None and is_last_page is not True:
                raise RuntimeError(
                    "get_all_turns pagination cannot continue: response must include "
                    "page_index, total_pages, or is_last_page=true."
                )
            current_page = next_page
    # ``response`` is the last page fetched, so the non-turn fields of the
    # result come from that page.
    return self._with_all_turns(response, all_turns)
async def stop(self) -> None:
    """Stop the agent session.

    A 404 ``ApiError`` from the server (the agent already stopped, e.g. it
    crashed or timed out) is treated as a successful stop. Any other
    failure sets the status to ``"error"``, emits an ``error`` event and is
    re-raised.

    Raises
    ------
    RuntimeError
        If the session is not ``"running"`` or has no agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot stop session in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    self._status = "stopping"

    def mark_stopped() -> None:
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})

    try:
        await self._client.agents.stop(
            self._app_id, self._agent_id, request_options=self._request_options()
        )
        mark_stopped()
    except Exception as error:
        # "Not found" means the agent is already gone: report a clean stop.
        if isinstance(error, ApiError) and error.status_code == 404:
            mark_stopped()
            return
        self._status = "error"
        self._emit("error", error)
        raise
async def interrupt(self) -> None:
    """Interrupt the agent while it is speaking or thinking.

    Raises ``RuntimeError`` unless the session is ``"running"`` and has an
    agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot interrupt in {self._status} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    await self._client.agents.interrupt(
        self._app_id, agent_id, request_options=self._request_options()
    )

async def think(
    self,
    text: str,
    *,
    on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None,
    on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None,
    on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None,
    interruptable: typing.Optional[bool] = None,
    metadata: typing.Optional[typing.Dict[str, str]] = None,
) -> AgentThinkResponse:
    """Inject a custom text instruction into the current session pipeline.

    Only the optional arguments that are not ``None`` are forwarded to
    ``agent_management.agent_think``.

    In API v2.7, omitting ``on_listening_action`` uses the server default
    ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to
    preserve the pre-v2.7 behavior.

    Raises ``RuntimeError`` unless the session is ``"running"`` and has an
    agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot think in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    optional_fields = {
        "on_listening_action": on_listening_action,
        "on_thinking_action": on_thinking_action,
        "on_speaking_action": on_speaking_action,
        "interruptable": interruptable,
        "metadata": metadata,
    }
    request_fields: typing.Dict[str, typing.Any] = {"text": text}
    request_fields.update(
        (key, value) for key, value in optional_fields.items() if value is not None
    )

    return await self._client.agent_management.agent_think(
        self._app_id,
        self._agent_id,
        request_options=self._request_options(),
        **request_fields,
    )

async def update(self, properties: typing.Any) -> None:
    """Update the agent configuration at runtime.

    Parameters
    ----------
    properties : UpdateAgentsRequestProperties
        Partial configuration to update.

    Raises ``RuntimeError`` unless the session is ``"running"`` and has an
    agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot update in {self._status} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    await self._client.agents.update(
        self._app_id,
        agent_id,
        properties=properties,
        request_options=self._request_options(),
    )

async def get_history(self) -> typing.Any:
    """Get the conversation history (requires an agent ID, not a running state)."""
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    return await self._client.agents.get_history(
        self._app_id, agent_id, request_options=self._request_options()
    )

async def get_info(self) -> typing.Any:
    """Get the current session info (requires an agent ID, not a running state)."""
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    return await self._client.agents.get(
        self._app_id, agent_id, request_options=self._request_options()
    )
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + src/agora_agent/agentkit/avatar_types.py: | + import warnings + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. 
+ """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + 
avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. + """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/constants.py: | + """ + Type-safe constants for agent configuration values. + Use these instead of raw strings to avoid typos and get IDE autocomplete. 
+ """ + + # Data channel: "rtm" | "datastream" + class DataChannel: + RTM = "rtm" + DATASTREAM = "datastream" + + class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + + + # Silence action when timeout elapses: "speak" | "think" + # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) + class SilenceActionValues: + SPEAK = "speak" + THINK = "think" + + + # SAL mode: "locking" | "recognition" + # (Use for sal.sal_mode — avoids shadowing SalMode type) + class SalModeValues: + LOCKING = "locking" + RECOGNITION = "recognition" + + + # Geofence area: "GLOBAL" | "NORTH_AMERICA" | "EUROPE" | "ASIA" | "INDIA" | "JAPAN" + class GeofenceArea: + GLOBAL = "GLOBAL" + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Geofence exclude area (when area is GLOBAL) + class GeofenceExcludeArea: + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Filler word selection rule: "shuffle" | "round_robin" + class FillerWordsSelectionRule: + SHUFFLE = "shuffle" + ROUND_ROBIN = "round_robin" + + + # Turn detection type (deprecated; use TurnDetectionNestedConfig.EndOfSpeech instead) + class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + # Think action value constants (match Fern wire values) + ThinkOnListeningActionInject = "inject" + ThinkOnListeningActionInterrupt = "interrupt" + ThinkOnListeningActionIgnore = "ignore" + ThinkOnThinkingActionInterrupt = "interrupt" + ThinkOnThinkingActionIgnore = "ignore" + ThinkOnSpeakingActionInterrupt = "interrupt" + ThinkOnSpeakingActionIgnore = "ignore" + status: unresolved + - id: patch-972dd5bd + content_hash: sha256:10f86db20e0b5a3800efce4913b736ff338dee29eb18cb31e89658e0293b848e + original_commit: 972dd5bdafc09b3981ab2ce4e0d02beae165c626 + original_message: updated docs + original_author: 
digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 16:13:35 -0400 + Subject: [PATCH] updated docs + + --- + docs/reference/agent.md | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 1e88b8b..3163f9c 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -264,3 +264,18 @@ to_properties( + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + + +## Type aliases + + + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + + + +## Cross-SDK discovery map + + + +| Concept | Python | TypeScript | Go | + +|---|---|---|---| + +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | + | `failure_message` | `Optional[str]` | `None` | Spoken on error | + | `max_history` | `Optional[int]` | `None` | Max conversation history length | + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | 
`filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent.agentkit.vendors import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent.agentkit.vendors import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent.agentkit.vendors import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent.agentkit.vendors import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. 
+ + + ```python + from agora_agent.agentkit.vendors import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Override the system prompt. + + ### `with_greeting(greeting: str) -> Agent` + + Override the greeting message. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. 
+ + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Set the message spoken via TTS when the LLM call fails. + + ### `with_max_history(max_history: int) -> Agent` + + Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | System prompt | + | `greeting` | `Optional[str]` | Greeting message | + | `failure_message` | `Optional[str]` | Message spoken when LLM fails | + | `max_history` | `Optional[int]` | Max conversation history length | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ + ## Cross-SDK discovery map + + | Concept | Python | TypeScript | Go | + |---|---|---|---| + | STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + | xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + | Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + | Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + status: unresolved + - id: patch-7465fada + content_hash: sha256:9c6ed2e5f48702293eed8b213cc31cce63a7ed5a1ad16a0b23e791c13e77746f + original_commit: 7465fadafa0f1e62051d99b42d0eeda85f31eeee + original_message: "fix(agentkit): resolve Python session typing issues" + original_author: digitallysavvy + base_generation: f652c69edbd1815c832fc9354c193090ac8dde8e + files: + - src/agora_agent/agentkit/agent_session.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action 
import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if 
self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + - id: patch-d29165c4 + content_hash: sha256:be59d1d3efc435d5e0b83305b2cd39ce3dad4534a4125de18028c137e692e659 + original_commit: d29165c4ddd8296af703a4e9ed848516f563dd1b + original_message: make python compat package publishable + original_author: chenyuguo + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/pyproject.toml + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 + From: chenyuguo + Date: Wed, 27 May 2026 17:24:50 +0800 + Subject: [PATCH] make python compat package publishable + + --- + compat/agora-agent-server-sdk/README.md | 2 ++ + compat/agora-agent-server-sdk/pyproject.toml | 3 +++ + .../src/agora_agent_server_sdk_compat/__init__.py | 1 + + 3 files changed, 6 insertions(+) + create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index 1388836..cff3cfe 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -9,3 +9,5 @@ pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index 8efbe53..ac93128 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -26,6 +26,9 @@ classifiers = [ + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + +packages = [ + + { include = "agora_agent_server_sdk_compat", from = "src"} + +] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + new file mode 100644 + index 0000000..55522c6 + --- /dev/null + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -0,0 +1 @@ + +"""Compatibility package for the renamed agora-agents distribution.""" + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.0.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.0.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility package for the renamed agora-agents distribution.""" + status: unresolved + - id: patch-fae1249a + content_hash: sha256:01bf21f3cc4c784dfcff80a48c9c7bb3123af4327a567b7c990b528e9780e9a2 + original_commit: fae1249a20c53761a2eb5515a1bf92ca666760d1 + original_message: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
+ original_author: digitallysavvy + base_generation: f652c69edbd1815c832fc9354c193090ac8dde8e + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 27 May 2026 16:58:18 -0400 + Subject: [PATCH] Re-export agora-agents API from legacy PyPI compatibility + package The compat distribution delegates to agora_agent via __getattr__ and + documents both import paths in its README. + + --- + compat/agora-agent-server-sdk/README.md | 7 +++++-- + .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- + 2 files changed, 18 insertions(+), 3 deletions(-) + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index cff3cfe..e43d1d8 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -8,6 +8,9 @@ New projects should install: + pip install agora-agents + ``` + + -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ +```python + +from agora_agent import Agora, Area + +from agora_agent_server_sdk_compat import Agora, Area + +``` + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + index 55522c6..6283244 100644 + --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -1 +1,13 @@ + -"""Compatibility package for the renamed agora-agents distribution.""" + +"""Compatibility re-exports for the renamed agora-agents package.""" + + + +import agora_agent as _agora_agent + + + +__all__ = getattr(_agora_agent, "__all__", []) + + + + + +def __getattr__(name: str): + + return getattr(_agora_agent, name) + + + + + +def __dir__(): + + return dir(_agora_agent) + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. 
The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility re-exports for the renamed agora-agents package.""" + + import agora_agent as _agora_agent + + __all__ = getattr(_agora_agent, "__all__", []) + + + def __getattr__(name: str): + return getattr(_agora_agent, name) + + + def __dir__(): + return dir(_agora_agent) + user_owned: true + - id: patch-fc9d93c3 + content_hash: sha256:93877741bdad745fda5dd549d7c3dd6bc315f4574aabd2defb52c0c795bff011 + original_commit: fc9d93c3026a6109d8a5e8b386418592f8d121c5 + original_message: Document agora-agents PyPI install name and migration notes + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/installation.md + patch_content: | + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index c14bdb2..f6f1750 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. + ## Install with pip + + ```sh + -pip install agora-agent-sdk + +pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + -poetry add agora-agent-sdk + +poetry add agora-agents + ``` + + ## Dependencies + theirs_snapshot: + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. 
+ --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Dependencies + + The following packages are installed automatically: + + | Package | Purpose | + |---|---| + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Sync vs. Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. + status: unresolved + - id: patch-44c21c14 + content_hash: sha256:34f08060a06ca824943ab02e75c3c83ad43a1b6e7d09ec6f8fa244ef82de6fcd + original_commit: 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b + original_message: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
+ original_author: digitallysavvy + base_generation: f652c69edbd1815c832fc9354c193090ac8dde8e + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_root_exports.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index f84862c..0d7a4aa 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from 
..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection 
import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from 
..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from 
..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from 
..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + 
EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + 
on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + 
+ def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: 
typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + 
"filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. 
+ + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + 
base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = 
self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import 
generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if 
self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_root_exports.py: | + import pytest + + import agora_agent + import agora_agent.agentkit as agentkit + + + def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + + def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + + def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + + def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ + - id: patch-d475306b + content_hash: sha256:407af5e7564d6e8d0b91f1e117cb433aec931f083225af53c6df2abfff281b22 + original_commit: d475306bd42279984bcf4934b900003e8e02c4eb + original_message: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - docs/getting-started/installation.md + patch_content: | + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index e43d1d8..1da36aa 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index 04b48da..8fca9ab 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + -See [Authentication](./authentication.md) for setup details. + +## Next steps + + + +- [Authentication](./authentication.md) — configure your credentials + +- [Quick Start](./quick-start.md) — build your first conversational agent + + + +## Migrating from a previous package name + + + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). 
+ theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. + --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Imports + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI + ``` + + The package installs as `agora-agents` and imports as `agora_agent`. + + ## Sync vs. 
Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + ## Dependencies + + | Package | Purpose | + | ------------------------------ | ------------------------------------------------------ | + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Next steps + + - [Authentication](./authentication.md) — configure your credentials + - [Quick Start](./quick-start.md) — build your first conversational agent + + ## Migrating from a previous package name + + The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). + status: unresolved + - id: patch-c9355576 + content_hash: sha256:83b3b6148b21f2b4d53ee67321777522f5f4e871b61ea3b23f3a6b88ca052769 + original_commit: c93555763ffd63267a737b3e430217a890f203db + original_message: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/authentication.md + - docs/guides/low-level-api.md + patch_content: | + diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md + index 31dcc56..74c62cd 100644 + --- a/docs/getting-started/authentication.md + +++ b/docs/getting-started/authentication.md + @@ -46,41 +46,6 @@ session = agent.create_session( + print(client.auth_mode) # "app-credentials" + ``` + + -## Other auth modes + +## Legacy auth modes + + -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. + - + -### Token auth (`auth_token`) + - + -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - auth_token="your-rest-auth-token", + -) + - + -session = agent.create_session( + - client, + - channel="room-123", + - agent_uid="1", + - remote_uids=["100"], + - token="your-rtc-join-token", + -) + -``` + - + -### Basic Auth (`customer_id` + `customer_secret`) + - + -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - customer_id="your-customer-id", + - customer_secret="your-customer-secret", + -) + -``` + +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. 
+ diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md + index 6677b45..47397b7 100644 + --- a/docs/guides/low-level-api.md + +++ b/docs/guides/low-level-api.md + @@ -1,187 +1,55 @@ + --- + sidebar_position: 10 + title: Low-Level API + -description: Direct client.agents.start() usage without the builder pattern. + +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. + +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + -## Raw telephony and phone-number APIs + - + -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + - + -- `client.telephony` for call status and hangup operations + -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
+ + -## Cascading flow (ASR → LLM → TTS) + +## Client setup + + ```python + from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import Tts_Elevenlabs + + client = Agora( + area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + -client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + + app_id="your-app-id", + + app_certificate="your-app-certificate", + ) + ``` + + -## Async (low-level) + +## Raw telephony and phone-number APIs + + -```python + -import asyncio + -from agora_agent import Area, AsyncAgora + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import 
Tts_Elevenlabs + +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + -client = AsyncAgora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + +- `client.telephony` for call status and hangup operations + +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + + +```python + +calls = client.telephony.list( + + appid=client.app_id, + + type="sip", + ) + + -async def main() -> None: + - await client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + - ) + - + -asyncio.run(main()) + +for call in calls: + + print(call.id, call.state) + ``` + + -## MLLM flow (multimodal) + +## Direct agent APIs + + -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). + +`client.agents` exposes the generated REST surface for advanced integrations. 
Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + -```python + -from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesMllm, + - StartAgentsRequestPropertiesMllmVendor, + - StartAgentsRequestPropertiesTts, + - StartAgentsRequestPropertiesTtsVendor, + - StartAgentsRequestPropertiesLlm, + -) + +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + -client = Agora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + - + -client.agents.start( + - client.app_id, + - name="mllm_agent", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="your_token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - mllm=StartAgentsRequestPropertiesMllm( + - enable=True, + - url="wss://api.openai.com/v1/realtime", + - api_key="", + - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + - params={ + - "model": "gpt-4o-realtime-preview", + - "voice": "alloy", + - }, + - input_modalities=["audio"], + - output_modalities=["text", "audio"], + - greeting_message="Hello! I'm ready to chat in real-time.", + - turn_detection={ + - "mode": "server_vad", + - "server_vad_config": { + - "idle_timeout_ms": 5000, + - }, + - }, + - ), + - ), + +```python + +info = session.raw.get( + + appid=session.app_id, + + agent_id=session.id, + ) + ``` + + -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). + +You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ theirs_snapshot: + docs/getting-started/authentication.md: | + --- + sidebar_position: 2 + title: Authentication + description: Configure the Python SDK with app credentials and understand other supported auth modes. + --- + + # Authentication + + Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. + + ## App credentials + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + + agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) + ) + + session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + ) + ``` + + ## Why app credentials + + - Fresh short-lived tokens per API call instead of reusing long-lived credentials + - No Customer ID / Customer Secret in request headers + - No manual REST or RTC token provisioning in application code + + ## Inspecting auth mode + + ```python + print(client.auth_mode) # "app-credentials" + ``` + + ## Legacy auth modes + + The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. + docs/guides/low-level-api.md: | + --- + sidebar_position: 10 + title: Low-Level API + description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + Use the `Agent` builder and `AgentSession` for conversational agent starts. 
That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. + + ## Client setup + + ```python + from agora_agent import Agora, Area + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + ``` + + ## Raw telephony and phone-number APIs + + AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + - `client.telephony` for call status and hangup operations + - `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + ```python + calls = client.telephony.list( + appid=client.app_id, + type="sip", + ) + + for call in calls: + print(call.id, call.state) + ``` + + ## Direct agent APIs + + `client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + ```python + info = session.raw.get( + appid=session.app_id, + agent_id=session.id, + ) + ``` + + You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ status: unresolved diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f84862c..0d7a4aa 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( AgentThinkAgentManagementRequestOnListeningAction, ) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index a749d1e..ddcd930 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -24,6 +24,7 @@ is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 50bdd08..f48098c 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -177,6 +177,49 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "generic", "params": params} +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = 
Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + +class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index b58f040..62cb3f2 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import 
BaseModel, ConfigDict, Field diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..9719b04 --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), {"data_channel": "rtm"}) # noqa: SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def test_with_audio_scenario_sets_session_parameter(): + agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def 
test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "agent greeting" + assert properties.llm.failure_message == "agent failure" + assert properties.llm.max_history == 2 + + +def test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + 
) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard must fire before the token-generation step so callers get a + clear, actionable error even when app_id/app_certificate are empty. 
+ """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + 
assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..198fcd0 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,383 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, + RimeTTS, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + warn=warn, + ) + + +def test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = 
Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + 
.with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["llm"]["greeting_message"] == "agent greeting" + assert properties["llm"]["failure_message"] == "agent failure" + assert properties["llm"]["max_history"] == 2 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + 
agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 + + +def test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + 
session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with 
pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..8473821 --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,122 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +import warnings + +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + +def test_xai_realtime_deprecated_alias_emits_same_vendor(): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", DeprecationWarning) + config = XaiRealtime(api_key="xai-key").to_config() + assert len(caught) == 1 + assert issubclass(caught[0].category, DeprecationWarning) + assert config["vendor"] == "xai" + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def test_generic_avatar_omits_session_enriched_fields_when_unset(): + config = GenericAvatar( + api_key="avatar-key", + 
api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + assert config["params"]["model"] == "explicit-model" + assert config["params"]["project_id"] == "explicit-project" + assert config["params"]["location"] == "explicit-region" + assert config["params"]["adc_credentials_string"] == "{}" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), + ).to_config() + + assert config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py new file mode 100644 index 0000000..fa73fc0 --- 
/dev/null +++ b/tests/custom/test_avatar_token.py @@ -0,0 +1,12 @@ +from agora_agent.agentkit import generate_convo_ai_token + + +def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): + token = generate_convo_ai_token( + app_id="0" * 32, + app_certificate="1" * 32, + channel_name="room", + uid=123, + ) + + assert token.startswith("007") diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py new file mode 100644 index 0000000..faca9bf --- /dev/null +++ b/tests/custom/test_llm_vendors.py @@ -0,0 +1,60 @@ +from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM + + +def test_groq_serializes_as_openai_compatible() -> None: + config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() + + assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" + assert config["api_key"] == "groq-key" + assert config["style"] == "openai" + assert config["params"]["model"] == "llama-3.3-70b-versatile" + + +def test_custom_llm_marks_request_as_custom() -> None: + config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() + + assert config["url"] == "https://llm.example.com/chat" + assert config["api_key"] == "key" + assert config["vendor"] == "custom" + assert config["style"] == "openai" + + +def test_vertex_ai_llm_includes_project_routing() -> None: + config = VertexAILLM( + api_key="vertex-token", + model="gemini-2.0-flash", + project_id="project", + location="us-central1", + ).to_config() + + assert config["api_key"] == "vertex-token" + assert config["style"] == "gemini" + assert config["params"]["model"] == "gemini-2.0-flash" + assert config["params"]["project_id"] == "project" + assert config["params"]["location"] == "us-central1" + + +def test_amazon_bedrock_serializes_as_anthropic_style() -> None: + config = AmazonBedrock( + api_key="bedrock-key", + url="https://bedrock.example.com/messages", + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + 
).to_config() + + assert config["api_key"] == "bedrock-key" + assert config["style"] == "anthropic" + assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + + +def test_dify_serializes_conversation_fields() -> None: + config = Dify( + api_key="dify-key", + url="https://api.dify.ai/v1/chat-messages", + user="user-1", + conversation_id="conversation-1", + ).to_config() + + assert config["api_key"] == "dify-key" + assert config["style"] == "dify" + assert config["params"]["user"] == "user-1" + assert config["params"]["conversation_id"] == "conversation-1" diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py new file mode 100644 index 0000000..9b2f508 --- /dev/null +++ b/tests/custom/test_root_exports.py @@ -0,0 +1,29 @@ +import pytest + +import agora_agent +import agora_agent.agentkit as agentkit + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + +def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + +def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + +def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + +def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ From a94bac6d7c71d1df7e6f796b1dea37734d347bc8 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 20:54:57 -0400 Subject: [PATCH 06/26] Align AgentKit provider wrappers with regenerated core schemas --- src/agora_agent/agentkit/agent.py | 40 ++++---- src/agora_agent/agentkit/vendors/llm.py | 51 ++++++++-- 
src/agora_agent/agentkit/vendors/mllm.py | 65 ++++++++++--- src/agora_agent/agentkit/vendors/stt.py | 13 ++- src/agora_agent/agentkit/vendors/tts.py | 117 +++++++++++++++++------ tests/custom/test_llm_vendors.py | 15 +-- tests/custom/test_stt_language.py | 9 +- tests/custom/test_tts_vendors.py | 49 ++++++++++ 8 files changed, 277 insertions(+), 82 deletions(-) create mode 100644 tests/custom/test_tts_vendors.py diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 7647818..bc1f803 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -8,14 +8,8 @@ from .agent_session import AgentSession, AsyncAgentSession from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties -from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr -from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor -from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm -from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle -from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm -from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties from ..agents.types.get_agents_response import GetAgentsResponse from ..agents.types.list_agents_response import ListAgentsResponse @@ -52,11 +46,6 @@ from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario from 
..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode -from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection -from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode -from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs -from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode -from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures @@ -67,6 +56,13 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..types.asr import Asr +from ..types.llm import Llm +from ..types.llm_style import LlmStyle as GeneratedLlmStyle +from ..types.mllm import Mllm +from ..types.mllm_turn_detection import MllmTurnDetection +from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode +from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( AgentThinkAgentManagementRequestOnListeningAction, ) @@ -82,14 +78,14 @@ from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS # Top-level aliases -LlmConfig = StartAgentsRequestPropertiesLlm -LlmStyle = StartAgentsRequestPropertiesLlmStyle -SttConfig = StartAgentsRequestPropertiesAsr +LlmConfig = Llm +LlmStyle = GeneratedLlmStyle +SttConfig = Asr AsrConfig = SttConfig -SttVendor = StartAgentsRequestPropertiesAsrVendor +SttVendor = typing.Any TtsConfig = Tts -MllmConfig = StartAgentsRequestPropertiesMllm -MllmVendor = StartAgentsRequestPropertiesMllmVendor +MllmConfig = Mllm +MllmVendor = GeneratedMllmVendor AvatarConfig = StartAgentsRequestPropertiesAvatar AvatarVendor = StartAgentsRequestPropertiesAvatarVendor TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection @@ -133,8 +129,8 @@ ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario InterruptionConfig = StartAgentsRequestPropertiesInterruption InterruptionMode = StartAgentsRequestPropertiesInterruptionMode -MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection -MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode +MllmTurnDetectionConfig = MllmTurnDetection +MllmTurnDetectionMode = GeneratedMllmTurnDetectionMode AgentConfig = StartAgentsRequestProperties AgentConfigUpdate = UpdateAgentsRequestProperties SessionInfo = GetAgentsResponse @@ -192,9 +188,9 @@ class SessionOptions(typing_extensions.TypedDict, total=False): warn: typing.Callable[[str], None] # LLM sub-type aliases -LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs -LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode -McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem +LlmGreetingConfigs = typing.Dict[str, typing.Any] +LlmGreetingConfigsMode = typing.Any +McpServersItem = typing.Dict[str, typing.Any] # Additional 
top-level config aliases GeofenceConfig = StartAgentsRequestPropertiesGeofence diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index b521867..3617879 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -2,12 +2,9 @@ from pydantic import BaseModel, ConfigDict, Field -from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( - StartAgentsRequestPropertiesLlmGreetingConfigs, -) from .base import BaseLLM -LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] +LlmGreetingConfigs = Dict[str, Any] def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -375,8 +372,11 @@ def to_config(self) -> Dict[str, Any]: class AmazonBedrockOptions(AnthropicOptions): model_config = ConfigDict(extra="forbid") - api_key: str = Field(..., description="Amazon Bedrock API key or gateway token") - url: str = Field(..., description="Amazon Bedrock proxy or runtime endpoint") + access_key: str = Field(..., description="AWS access key ID") + secret_key: str = Field(..., description="AWS secret access key") + region: str = Field(..., description="AWS region") + api_key: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions compatibility") + url: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions compatibility") class AmazonBedrock(BaseLLM): @@ -384,7 +384,44 @@ def __init__(self, **kwargs: Any): self.options = AmazonBedrockOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - return Anthropic(**_dump_optional_model(self.options)).to_config() + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = 
self.options.top_p + + config: Dict[str, Any] = { + "access_key": self.options.access_key, + "secret_key": self.options.secret_key, + "region": self.options.region, + "model": self.options.model, + "params": params, + "style": "bedrock", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config class DifyOptions(BaseModel): diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index b58f040..e5fcb5b 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -2,12 +2,10 @@ from pydantic import BaseModel, ConfigDict, Field -from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( - StartAgentsRequestPropertiesMllmTurnDetection, -) +from ...types.mllm_turn_detection import MllmTurnDetection from .base import BaseMLLM -MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection 
+MllmTurnDetectionConfig = MllmTurnDetection class OpenAIRealtimeOptions(BaseModel): @@ -15,6 +13,9 @@ class OpenAIRealtimeOptions(BaseModel): api_key: str = Field(..., description="OpenAI API key") model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + voice: Optional[str] = Field(default=None, description="Voice identifier") + instructions: Optional[str] = Field(default=None, description="System instructions") + input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="Audio transcription settings") url: Optional[str] = Field(default=None, description="WebSocket URL") greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") @@ -36,13 +37,25 @@ def to_config(self) -> Dict[str, Any]: if self.options.url is not None: config["url"] = self.options.url - if self.options.model is not None: - params = {"model": self.options.model} + if ( + self.options.model is not None + or self.options.params is not None + or self.options.voice is not None + or self.options.instructions is not None + or self.options.input_audio_transcription is not None + ): + params = {} + if self.options.model is not None: + params["model"] = self.options.model if self.options.params is not None: params.update(self.options.params) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.input_audio_transcription is not None: + params["input_audio_transcription"] = self.options.input_audio_transcription config["params"] = params - elif self.options.params is not None: - config["params"] = self.options.params if self.options.greeting_message is not None: config["greeting_message"] = self.options.greeting_message if self.options.input_modalities is not None: @@ -128,6 
+141,11 @@ class VertexAIOptions(BaseModel): adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") instructions: Optional[str] = Field(default=None, description="System instructions") voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") @@ -145,16 +163,26 @@ def to_config(self) -> Dict[str, Any]: # matching the TypeScript SDK. 
params: Dict[str, Any] = dict(self.options.additional_params or {}) params["model"] = self.options.model - params["project_id"] = self.options.project_id - params["location"] = self.options.location - params["adc_credentials_string"] = self.options.adc_credentials_string if self.options.instructions is not None: params["instructions"] = self.options.instructions if self.options.voice is not None: params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options config: Dict[str, Any] = { "vendor": "vertexai", + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, "params": params, } @@ -184,6 +212,11 @@ class GeminiLiveOptions(BaseModel): url: Optional[str] = Field(default=None, description="WebSocket URL") instructions: Optional[str] = Field(default=None, description="System instructions") voice: Optional[str] = Field(default=None, description="Voice name") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") @@ -205,6 +238,16 @@ def to_config(self) -> Dict[str, Any]: params["instructions"] = self.options.instructions if self.options.voice is not None: params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options config: Dict[str, Any] = { "vendor": "gemini", diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index 73acc44..48aa43b 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -129,7 +129,7 @@ class MicrosoftSTTOptions(BaseModel): key: str = Field(..., description="Azure subscription key") region: str = Field(..., description="Azure region (e.g., eastus)") - language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + language: str = Field(..., description="Language code (e.g., en-US)") interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -175,15 +175,14 @@ def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = dict(self.options.additional_params or {}) params["api_key"] = self.options.api_key - transcription = dict(self.options.input_audio_transcription or {}) + 
transcription = {"model": "whisper-1", **(self.options.input_audio_transcription or {})} if self.options.model is not None: transcription["model"] = self.options.model if self.options.prompt is not None: transcription["prompt"] = self.options.prompt if self.options.language is not None: transcription["language"] = self.options.language - if transcription: - params["input_audio_transcription"] = transcription + params["input_audio_transcription"] = transcription config: Dict[str, Any] = { "vendor": "openai", @@ -201,7 +200,7 @@ class GoogleSTTOptions(BaseModel): project_id: str = Field(..., description="Google Cloud project ID") location: str = Field(..., description="Google Cloud region") adc_credentials_string: str = Field(..., description="Google service account credentials JSON string") - language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + language: str = Field(..., description="Language code (e.g., en-US)") interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Recognition model") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -239,7 +238,7 @@ class AmazonSTTOptions(BaseModel): access_key: str = Field(..., description="AWS Access Key ID") secret_key: str = Field(..., description="AWS Secret Access Key") region: str = Field(..., description="AWS region (e.g., us-east-1)") - language: Optional[str] = Field(default=None, description="Language code") + language: str = Field(..., description="Language code") interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -271,7 +270,7 @@ class AssemblyAISTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="AssemblyAI API key") - language: 
Optional[str] = Field(default=None, description="Language code") + language: str = Field(..., description="Language code") interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") uri: Optional[str] = Field(default=None, description="AssemblyAI streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index 557ea56..fb3692a 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -63,6 +63,8 @@ class MicrosoftTTSOptions(BaseModel): region: str = Field(..., description="Azure region (e.g., eastus)") voice_name: str = Field(..., description="Voice name") sample_rate: Optional[MicrosoftSampleRate] = Field(default=None, description="Sample rate in Hz") + speed: Optional[float] = Field(default=None, description="Speaking rate multiplier") + volume: Optional[float] = Field(default=None, description="Audio volume") skip_patterns: Optional[List[int]] = Field(default=None) class MicrosoftTTS(BaseTTS): @@ -82,6 +84,10 @@ def to_config(self) -> Dict[str, Any]: if self.options.sample_rate is not None: params["sample_rate"] = self.options.sample_rate + if self.options.speed is not None: + params["speed"] = self.options.speed + if self.options.volume is not None: + params["volume"] = self.options.volume result: Dict[str, Any] = {"vendor": "microsoft", "params": params} if self.options.skip_patterns is not None: @@ -95,7 +101,9 @@ class OpenAITTSOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="OpenAI API key") voice: str = Field(..., description="Voice name (alloy, echo, fable, onyx, nova, shimmer)") model: Optional[str] = Field(default=None, description="Model name (tts-1, tts-1-hd)") + base_url: Optional[str] = Field(default=None, description="Endpoint URL") response_format: Optional[str] = 
Field(default=None, description="Audio format (e.g., pcm)") + instructions: Optional[str] = Field(default=None, description="Custom voice instructions") speed: Optional[float] = Field(default=None, description="Speech speed multiplier") skip_patterns: Optional[List[int]] = Field(default=None) @@ -113,11 +121,15 @@ def to_config(self) -> Dict[str, Any]: } if self.options.api_key is not None: params["api_key"] = self.options.api_key + if self.options.base_url is not None: + params["base_url"] = self.options.base_url if self.options.model is not None: params["model"] = self.options.model if self.options.response_format is not None: params["response_format"] = self.options.response_format + if self.options.instructions is not None: + params["instructions"] = self.options.instructions if self.options.speed is not None: params["speed"] = self.options.speed @@ -132,7 +144,9 @@ class CartesiaTTSOptions(BaseModel): api_key: str = Field(..., description="Cartesia API key") voice_id: str = Field(..., description="Voice ID") - model_id: Optional[str] = Field(default=None, description="Model ID") + model_id: str = Field(..., description="Model ID") + base_url: Optional[str] = Field(default=None, description="WebSocket URL") + language: Optional[str] = Field(default=None, description="Target language") sample_rate: Optional[CartesiaSampleRate] = Field(default=None, description="Sample rate in Hz") skip_patterns: Optional[List[int]] = Field(default=None) @@ -147,13 +161,16 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.api_key, + "model_id": self.options.model_id, "voice": {"mode": "id", "id": self.options.voice_id}, } - if self.options.model_id is not None: - params["model_id"] = self.options.model_id + if self.options.base_url is not None: + params["base_url"] = self.options.base_url if self.options.sample_rate is not None: - params["sample_rate"] = self.options.sample_rate + 
params["output_format"] = {"container": "raw", "sample_rate": self.options.sample_rate} + if self.options.language is not None: + params["language"] = self.options.language result: Dict[str, Any] = {"vendor": "cartesia", "params": params} if self.options.skip_patterns is not None: @@ -164,7 +181,7 @@ def to_config(self) -> Dict[str, Any]: class GoogleTTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") - key: str = Field(..., description="Google Cloud API key") + key: str = Field(..., description="Google Cloud service account credentials JSON string") voice_name: str = Field(..., description="Voice name") language_code: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") sample_rate_hertz: Optional[GoogleTTSSampleRate] = Field(default=None, description="Sample rate in Hz") @@ -180,14 +197,14 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, - "voice_name": self.options.voice_name, + "credentials": self.options.key, + "VoiceSelectionParams": {"name": self.options.voice_name}, } if self.options.language_code is not None: - params["language_code"] = self.options.language_code + params["VoiceSelectionParams"]["language_code"] = self.options.language_code if self.options.sample_rate_hertz is not None: - params["sample_rate_hertz"] = self.options.sample_rate_hertz + params["AudioConfig"] = {"sample_rate_hertz": self.options.sample_rate_hertz} result: Dict[str, Any] = {"vendor": "google", "params": params} if self.options.skip_patterns is not None: @@ -202,6 +219,7 @@ class AmazonTTSOptions(BaseModel): secret_key: str = Field(..., description="AWS secret key") region: str = Field(..., description="AWS region (e.g., us-east-1)") voice_id: str = Field(..., description="Amazon Polly voice ID") + engine: Optional[str] = Field(default=None, description="Amazon Polly engine type") skip_patterns: Optional[List[int]] = Field(default=None) class 
AmazonTTS(BaseTTS): @@ -214,11 +232,13 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "access_key": self.options.access_key, - "secret_key": self.options.secret_key, - "region": self.options.region, - "voice_id": self.options.voice_id, + "aws_access_key_id": self.options.access_key, + "aws_secret_access_key": self.options.secret_key, + "region_name": self.options.region, + "voice": self.options.voice_id, } + if self.options.engine is not None: + params["engine"] = self.options.engine result: Dict[str, Any] = {"vendor": "amazon", "params": params} if self.options.skip_patterns is not None: @@ -267,6 +287,11 @@ class HumeAITTSOptions(BaseModel): key: str = Field(..., description="Hume AI API key") config_id: Optional[str] = Field(default=None, description="Configuration ID") + voice_id: Optional[str] = Field(default=None, description="Hume AI voice ID") + base_url: Optional[str] = Field(default=None, description="Base URL") + provider: Optional[str] = Field(default=None, description="Voice provider type") + speed: Optional[float] = Field(default=None, description="Playback speed") + trailing_silence: Optional[float] = Field(default=None, description="Trailing silence in seconds") skip_patterns: Optional[List[int]] = Field(default=None) class HumeAITTS(BaseTTS): @@ -282,6 +307,16 @@ def to_config(self) -> Dict[str, Any]: if self.options.config_id is not None: params["config_id"] = self.options.config_id + if self.options.voice_id is not None: + params["voice_id"] = self.options.voice_id + if self.options.base_url is not None: + params["base_url"] = self.options.base_url + if self.options.provider is not None: + params["provider"] = self.options.provider + if self.options.speed is not None: + params["speed"] = self.options.speed + if self.options.trailing_silence is not None: + params["trailing_silence"] = self.options.trailing_silence result: Dict[str, Any] = {"vendor": "humeai", "params": params} if 
self.options.skip_patterns is not None: @@ -295,9 +330,7 @@ class RimeTTSOptions(BaseModel): key: str = Field(..., description="Rime API key") speaker: str = Field(..., description="Speaker ID") model_id: Optional[str] = Field(default=None, description="Model ID") - lang: Optional[str] = Field(default=None, description="Language code") - sampling_rate: Optional[int] = Field(default=None, description="Sampling rate in Hz") - speed_alpha: Optional[float] = Field(default=None, description="Speed multiplier") + base_url: Optional[str] = Field(default=None, description="WebSocket URL") skip_patterns: Optional[List[int]] = Field(default=None) class RimeTTS(BaseTTS): @@ -310,18 +343,14 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, + "api_key": self.options.key, "speaker": self.options.speaker, } if self.options.model_id is not None: - params["model_id"] = self.options.model_id - if self.options.lang is not None: - params["lang"] = self.options.lang - if self.options.sampling_rate is not None: - params["samplingRate"] = self.options.sampling_rate - if self.options.speed_alpha is not None: - params["speedAlpha"] = self.options.speed_alpha + params["modelId"] = self.options.model_id + if self.options.base_url is not None: + params["base_url"] = self.options.base_url result: Dict[str, Any] = {"vendor": "rime", "params": params} if self.options.skip_patterns is not None: @@ -334,6 +363,7 @@ class FishAudioTTSOptions(BaseModel): key: str = Field(..., description="Fish Audio API key") reference_id: str = Field(..., description="Reference ID") + backend: Optional[str] = Field(default=None, description="Backend") skip_patterns: Optional[List[int]] = Field(default=None) class FishAudioTTS(BaseTTS): @@ -346,9 +376,11 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, + "api_key": self.options.key, 
"reference_id": self.options.reference_id, } + if self.options.backend is not None: + params["backend"] = self.options.backend result: Dict[str, Any] = {"vendor": "fishaudio", "params": params} if self.options.skip_patterns is not None: @@ -397,6 +429,10 @@ class SarvamTTSOptions(BaseModel): key: str = Field(..., description="Sarvam API subscription key") speaker: str = Field(..., description="Speaker/voice ID (e.g., 'anushka', 'abhilash', 'karun', 'hitesh', 'manisha', 'vidya', 'arya')") target_language_code: str = Field(..., description="Target language code (e.g., 'en-IN', 'hi-IN', 'ta-IN')") + pitch: Optional[float] = Field(default=None, description="Pitch adjustment") + pace: Optional[float] = Field(default=None, description="Speed of speech") + loudness: Optional[float] = Field(default=None, description="Volume level") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") skip_patterns: Optional[List[int]] = Field(default=None) class SarvamTTS(BaseTTS): @@ -409,10 +445,18 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, + "api_subscription_key": self.options.key, "speaker": self.options.speaker, "target_language_code": self.options.target_language_code, } + if self.options.pitch is not None: + params["pitch"] = self.options.pitch + if self.options.pace is not None: + params["pace"] = self.options.pace + if self.options.loudness is not None: + params["loudness"] = self.options.loudness + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate result: Dict[str, Any] = {"vendor": "sarvam", "params": params} if self.options.skip_patterns is not None: @@ -425,7 +469,13 @@ class MurfTTSOptions(BaseModel): key: str = Field(..., description="Murf API key") voice_id: str = Field(..., description="Voice ID (e.g., 'Ariana', 'Natalie', 'Ken')") + base_url: str = Field(..., description="WebSocket endpoint") style: 
Optional[str] = Field(default=None, description="Voice style (e.g., 'Angry', 'Sad', 'Conversational', 'Newscast')") + locale: Optional[str] = Field(default=None, description="Voice locale") + rate: Optional[float] = Field(default=None, description="Speech rate") + pitch: Optional[float] = Field(default=None, description="Pitch adjustment") + model: Optional[str] = Field(default=None, description="TTS model") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate") skip_patterns: Optional[List[int]] = Field(default=None) class MurfTTS(BaseTTS): @@ -438,12 +488,23 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, - "voice_id": self.options.voice_id, + "api_key": self.options.key, + "base_url": self.options.base_url, + "voiceId": self.options.voice_id, } if self.options.style is not None: params["style"] = self.options.style + if self.options.locale is not None: + params["locale"] = self.options.locale + if self.options.rate is not None: + params["rate"] = self.options.rate + if self.options.pitch is not None: + params["pitch"] = self.options.pitch + if self.options.model is not None: + params["model"] = self.options.model + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate result: Dict[str, Any] = {"vendor": "murf", "params": params} if self.options.skip_patterns is not None: diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py index faca9bf..c169d76 100644 --- a/tests/custom/test_llm_vendors.py +++ b/tests/custom/test_llm_vendors.py @@ -34,16 +34,19 @@ def test_vertex_ai_llm_includes_project_routing() -> None: assert config["params"]["location"] == "us-central1" -def test_amazon_bedrock_serializes_as_anthropic_style() -> None: +def test_amazon_bedrock_serializes_as_bedrock_style() -> None: config = AmazonBedrock( - api_key="bedrock-key", - url="https://bedrock.example.com/messages", + 
access_key="aws-access", + secret_key="aws-secret", + region="us-east-1", model="anthropic.claude-3-5-sonnet-20241022-v2:0", ).to_config() - assert config["api_key"] == "bedrock-key" - assert config["style"] == "anthropic" - assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + assert config["access_key"] == "aws-access" + assert config["secret_key"] == "aws-secret" + assert config["region"] == "us-east-1" + assert config["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + assert config["style"] == "bedrock" def test_dify_serializes_conversation_fields() -> None: diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index 1573fc5..e04693f 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -67,7 +67,7 @@ def test_explicit_interaction_language_can_differ_from_provider_language() -> No def test_default_interaction_language_is_sent_without_stt() -> None: props = properties(base_agent()) - assert props["asr"] == {"language": "en-US"} + assert props["asr"]["language"] == "en-US" def test_stt_vendor_params_match_documented_shapes() -> None: @@ -84,6 +84,13 @@ def test_stt_vendor_params_match_documented_shapes() -> None: }, } + assert OpenAISTT(api_key="openai-key").to_config()["params"] == { + "api_key": "openai-key", + "input_audio_transcription": { + "model": "whisper-1", + }, + } + assert GoogleSTT( project_id="project", location="global", diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py new file mode 100644 index 0000000..b9bc5d2 --- /dev/null +++ b/tests/custom/test_tts_vendors.py @@ -0,0 +1,49 @@ +from agora_agent import AmazonTTS, CartesiaTTS, FishAudioTTS, GoogleTTS, MurfTTS, RimeTTS, SarvamTTS + + +def test_tts_vendor_params_match_generated_core_shapes() -> None: + assert AmazonTTS(access_key="access", secret_key="secret", region="us-east-1", voice_id="Joanna", engine="neural").to_config()["params"] == { + "aws_access_key_id": 
"access", + "aws_secret_access_key": "secret", + "region_name": "us-east-1", + "voice": "Joanna", + "engine": "neural", + } + + assert GoogleTTS(key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000).to_config()["params"] == { + "credentials": "{}", + "VoiceSelectionParams": {"name": "en-US-JennyNeural", "language_code": "en-US"}, + "AudioConfig": {"sample_rate_hertz": 24000}, + } + + assert CartesiaTTS(api_key="cartesia-key", voice_id="voice", model_id="sonic-2", sample_rate=24000).to_config()["params"] == { + "api_key": "cartesia-key", + "model_id": "sonic-2", + "voice": {"mode": "id", "id": "voice"}, + "output_format": {"container": "raw", "sample_rate": 24000}, + } + + assert RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config()["params"] == { + "api_key": "rime-key", + "speaker": "speaker", + "modelId": "mist", + } + + assert FishAudioTTS(key="fish-key", reference_id="ref", backend="speech-1.5").to_config()["params"] == { + "api_key": "fish-key", + "reference_id": "ref", + "backend": "speech-1.5", + } + + assert SarvamTTS(key="sarvam-key", speaker="anushka", target_language_code="en-IN", sample_rate=24000).to_config()["params"] == { + "api_subscription_key": "sarvam-key", + "speaker": "anushka", + "target_language_code": "en-IN", + "sample_rate": 24000, + } + + assert MurfTTS(key="murf-key", voice_id="Ariana", base_url="wss://murf.example/ws").to_config()["params"] == { + "api_key": "murf-key", + "base_url": "wss://murf.example/ws", + "voiceId": "Ariana", + } From 49af6f655eabf12183b980a30ece3ee23285a95a Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 21:10:49 -0400 Subject: [PATCH 07/26] Align AgentKit TTS provider options with docs --- src/agora_agent/agentkit/vendors/tts.py | 30 +++++++++++-------------- tests/custom/test_tts_vendors.py | 4 ++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/agora_agent/agentkit/vendors/tts.py 
b/src/agora_agent/agentkit/vendors/tts.py index fb3692a..b5c4b6a 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -219,7 +219,7 @@ class AmazonTTSOptions(BaseModel): secret_key: str = Field(..., description="AWS secret key") region: str = Field(..., description="AWS region (e.g., us-east-1)") voice_id: str = Field(..., description="Amazon Polly voice ID") - engine: Optional[str] = Field(default=None, description="Amazon Polly engine type") + engine: str = Field(..., description="Amazon Polly engine type") skip_patterns: Optional[List[int]] = Field(default=None) class AmazonTTS(BaseTTS): @@ -236,9 +236,8 @@ def to_config(self) -> Dict[str, Any]: "aws_secret_access_key": self.options.secret_key, "region_name": self.options.region, "voice": self.options.voice_id, + "engine": self.options.engine, } - if self.options.engine is not None: - params["engine"] = self.options.engine result: Dict[str, Any] = {"vendor": "amazon", "params": params} if self.options.skip_patterns is not None: @@ -329,7 +328,7 @@ class RimeTTSOptions(BaseModel): key: str = Field(..., description="Rime API key") speaker: str = Field(..., description="Speaker ID") - model_id: Optional[str] = Field(default=None, description="Model ID") + model_id: str = Field(..., description="Model ID") base_url: Optional[str] = Field(default=None, description="WebSocket URL") skip_patterns: Optional[List[int]] = Field(default=None) @@ -345,10 +344,8 @@ def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.key, "speaker": self.options.speaker, + "modelId": self.options.model_id, } - - if self.options.model_id is not None: - params["modelId"] = self.options.model_id if self.options.base_url is not None: params["base_url"] = self.options.base_url @@ -363,7 +360,7 @@ class FishAudioTTSOptions(BaseModel): key: str = Field(..., description="Fish Audio API key") reference_id: str = Field(..., description="Reference ID") - backend: 
Optional[str] = Field(default=None, description="Backend") + backend: str = Field(..., description="Backend") skip_patterns: Optional[List[int]] = Field(default=None) class FishAudioTTS(BaseTTS): @@ -378,9 +375,8 @@ def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.key, "reference_id": self.options.reference_id, + "backend": self.options.backend, } - if self.options.backend is not None: - params["backend"] = self.options.backend result: Dict[str, Any] = {"vendor": "fishaudio", "params": params} if self.options.skip_patterns is not None: @@ -468,8 +464,8 @@ class MurfTTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") key: str = Field(..., description="Murf API key") - voice_id: str = Field(..., description="Voice ID (e.g., 'Ariana', 'Natalie', 'Ken')") - base_url: str = Field(..., description="WebSocket endpoint") + voice_id: Optional[str] = Field(default=None, description="Voice ID (e.g., 'Ariana', 'Natalie', 'Ken')") + base_url: Optional[str] = Field(default=None, description="WebSocket endpoint") style: Optional[str] = Field(default=None, description="Voice style (e.g., 'Angry', 'Sad', 'Conversational', 'Newscast')") locale: Optional[str] = Field(default=None, description="Voice locale") rate: Optional[float] = Field(default=None, description="Speech rate") @@ -487,12 +483,12 @@ def sample_rate(self) -> Optional[int]: return None def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { - "api_key": self.options.key, - "base_url": self.options.base_url, - "voiceId": self.options.voice_id, - } + params: Dict[str, Any] = {"api_key": self.options.key} + if self.options.base_url is not None: + params["base_url"] = self.options.base_url + if self.options.voice_id is not None: + params["voiceId"] = self.options.voice_id if self.options.style is not None: params["style"] = self.options.style if self.options.locale is not None: diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py 
index b9bc5d2..f87f4e9 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -47,3 +47,7 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: "base_url": "wss://murf.example/ws", "voiceId": "Ariana", } + + assert MurfTTS(key="murf-key").to_config()["params"] == { + "api_key": "murf-key", + } From bad47d966a8cbdd2a1824671bc72157c5fde3c91 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 21:41:26 -0400 Subject: [PATCH 08/26] Align AgentKit provider BYOK parameter requirements --- src/agora_agent/agentkit/vendors/llm.py | 34 +++++++++----- src/agora_agent/agentkit/vendors/tts.py | 61 +++++++++++++++++++------ tests/custom/test_llm_vendors.py | 38 ++++++++++++++- tests/custom/test_stt_language.py | 4 +- tests/custom/test_tts_vendors.py | 50 +++++++++++++++++++- 5 files changed, 155 insertions(+), 32 deletions(-) diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 3617879..ee6d3af 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, Optional, Union -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator from .base import BaseLLM @@ -47,6 +47,14 @@ class OpenAIOptions(BaseModel): mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + @model_validator(mode="after") + def _validate_byok_params(self) -> "OpenAIOptions": + if self.api_key is not None and self.base_url is None: + raise ValueError("OpenAI requires base_url when api_key is set") + if self.api_key is None and self.base_url is not None: + raise ValueError("OpenAI base_url is only valid when api_key is set") + return self + class OpenAI(BaseLLM): def __init__(self, **kwargs: Any): self.options = 
OpenAIOptions(**kwargs) @@ -101,6 +109,7 @@ class AzureOpenAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Azure OpenAI API key") + model: str = Field(..., description="Azure deployment model name") endpoint: str = Field(..., description="Azure endpoint URL") deployment_name: str = Field(..., description="Azure deployment name") api_version: str = Field(default="2024-08-01-preview", description="Azure API version") @@ -139,7 +148,7 @@ def to_config(self) -> Dict[str, Any]: } # Named fields take precedence over anything in the generic params dict. - params: Dict[str, Any] = dict(self.options.params or {}) + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} if self.options.temperature is not None: params["temperature"] = self.options.temperature if self.options.top_p is not None: @@ -176,8 +185,8 @@ class AnthropicOptions(BaseModel): api_key: str = Field(..., description="Anthropic API key") model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") - url: Optional[str] = Field(default=None, description="Custom API endpoint URL") - max_tokens: Optional[int] = Field(default=None, gt=0) + url: str = Field(..., description="Anthropic messages endpoint URL") + max_tokens: int = Field(..., gt=0) temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -185,7 +194,7 @@ class AnthropicOptions(BaseModel): failure_message: Optional[str] = Field(default=None) input_modalities: Optional[List[str]] = Field(default=None) params: Optional[Dict[str, Any]] = Field(default=None) - headers: Optional[Dict[str, str]] = Field(default=None) + headers: Dict[str, str] = Field(..., description="Anthropic request headers") output_modalities: Optional[List[str]] = Field(default=None) greeting_configs: Optional[LlmGreetingConfigs] = 
Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) @@ -208,17 +217,16 @@ def to_config(self) -> Dict[str, Any]: params["top_p"] = self.options.top_p config: Dict[str, Any] = { - "url": self.options.url or "https://api.anthropic.com/v1/messages", + "url": self.options.url, "api_key": self.options.api_key, "params": params, + "headers": self.options.headers, "style": "anthropic", "input_modalities": self.options.input_modalities or ["text"], } if self.options.system_messages is not None: config["system_messages"] = self.options.system_messages - if self.options.headers is not None: - config["headers"] = self.options.headers if self.options.greeting_message is not None: config["greeting_message"] = self.options.greeting_message if self.options.failure_message is not None: @@ -315,7 +323,7 @@ class GroqOptions(OpenAIOptions): api_key: str = Field(..., description="Groq API key") model: str = Field(default="llama-3.3-70b-versatile", description="Model name") - base_url: Optional[str] = Field(default=None, description="Custom Groq-compatible endpoint") + base_url: str = Field(..., description="Groq-compatible endpoint") class Groq(BaseLLM): @@ -324,7 +332,7 @@ def __init__(self, **kwargs: Any): def to_config(self) -> Dict[str, Any]: config = OpenAI(**_dump_optional_model(self.options)).to_config() - config["url"] = self.options.base_url or "https://api.groq.com/openai/v1/chat/completions" + config["url"] = self.options.base_url return config @@ -375,8 +383,10 @@ class AmazonBedrockOptions(AnthropicOptions): access_key: str = Field(..., description="AWS access key ID") secret_key: str = Field(..., description="AWS secret access key") region: str = Field(..., description="AWS region") + max_tokens: Optional[int] = Field(default=None, gt=0) api_key: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions compatibility") url: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions 
compatibility") + headers: Optional[Dict[str, str]] = Field(default=None) class AmazonBedrock(BaseLLM): @@ -393,6 +403,7 @@ def to_config(self) -> Dict[str, Any]: params["top_p"] = self.options.top_p config: Dict[str, Any] = { + "url": self.options.url or f"https://bedrock-runtime.{self.options.region}.amazonaws.com/model/{self.options.model}/converse-stream", "access_key": self.options.access_key, "secret_key": self.options.secret_key, "region": self.options.region, @@ -429,6 +440,7 @@ class DifyOptions(BaseModel): api_key: str = Field(..., description="Dify API key") url: str = Field(..., description="Dify workflow or chat endpoint") + model: str = Field(..., description="Dify model identifier") user: Optional[str] = Field(default=None, description="Dify user identifier") conversation_id: Optional[str] = Field(default=None, description="Dify conversation ID") system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -450,7 +462,7 @@ def __init__(self, **kwargs: Any): self.options = DifyOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = dict(self.options.params or {}) + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} if self.options.user is not None: params["user"] = self.options.user if self.options.conversation_id is not None: diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index b5c4b6a..61ceb2e 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -1,17 +1,16 @@ from typing import Any, Dict, List, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator from .base import BaseTTS, CartesiaSampleRate, ElevenLabsSampleRate, GoogleTTSSampleRate, MicrosoftSampleRate - class ElevenLabsTTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") key: str = Field(..., description="ElevenLabs API key") model_id: str = 
Field(..., description="Model ID (e.g., eleven_flash_v2_5)") voice_id: str = Field(..., description="Voice ID") - base_url: Optional[str] = Field(default=None, description="WebSocket base URL") + base_url: str = Field(..., description="WebSocket base URL") sample_rate: Optional[ElevenLabsSampleRate] = Field(default=None, description="Sample rate in Hz") skip_patterns: Optional[List[int]] = Field(default=None) optimize_streaming_latency: Optional[int] = Field(default=None, ge=0, le=4) @@ -31,12 +30,11 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "key": self.options.key, + "base_url": self.options.base_url, "model_id": self.options.model_id, "voice_id": self.options.voice_id, } - if self.options.base_url is not None: - params["base_url"] = self.options.base_url if self.options.sample_rate is not None: params["sample_rate"] = self.options.sample_rate if self.options.optimize_streaming_latency is not None: @@ -107,6 +105,23 @@ class OpenAITTSOptions(BaseModel): speed: Optional[float] = Field(default=None, description="Speech speed multiplier") skip_patterns: Optional[List[int]] = Field(default=None) + @model_validator(mode="after") + def _validate_byok_params(self) -> "OpenAITTSOptions": + if self.api_key is not None: + missing = [ + name + for name, value in ( + ("model", self.model), + ("base_url", self.base_url), + ) + if value is None + ] + if missing: + raise ValueError(f"OpenAITTS requires {', '.join(missing)} when api_key is set") + elif self.base_url is not None: + raise ValueError("OpenAITTS base_url is only valid when api_key is set") + return self + class OpenAITTS(BaseTTS): def __init__(self, **kwargs: Any): self.options = OpenAITTSOptions(**kwargs) @@ -121,11 +136,11 @@ def to_config(self) -> Dict[str, Any]: } if self.options.api_key is not None: params["api_key"] = self.options.api_key - if self.options.base_url is not None: params["base_url"] = self.options.base_url - - if self.options.model 
is not None: params["model"] = self.options.model + elif self.options.model is not None: + params["model"] = self.options.model + if self.options.response_format is not None: params["response_format"] = self.options.response_format if self.options.instructions is not None: @@ -286,9 +301,9 @@ class HumeAITTSOptions(BaseModel): key: str = Field(..., description="Hume AI API key") config_id: Optional[str] = Field(default=None, description="Configuration ID") - voice_id: Optional[str] = Field(default=None, description="Hume AI voice ID") + voice_id: str = Field(..., description="Hume AI voice ID") base_url: Optional[str] = Field(default=None, description="Base URL") - provider: Optional[str] = Field(default=None, description="Voice provider type") + provider: str = Field(..., description="Voice provider type") speed: Optional[float] = Field(default=None, description="Playback speed") trailing_silence: Optional[float] = Field(default=None, description="Trailing silence in seconds") skip_patterns: Optional[List[int]] = Field(default=None) @@ -302,16 +317,16 @@ def sample_rate(self) -> Optional[int]: return None def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"key": self.options.key} + params: Dict[str, Any] = { + "key": self.options.key, + "voice_id": self.options.voice_id, + "provider": self.options.provider, + } if self.options.config_id is not None: params["config_id"] = self.options.config_id - if self.options.voice_id is not None: - params["voice_id"] = self.options.voice_id if self.options.base_url is not None: params["base_url"] = self.options.base_url - if self.options.provider is not None: - params["provider"] = self.options.provider if self.options.speed is not None: params["speed"] = self.options.speed if self.options.trailing_silence is not None: @@ -394,6 +409,22 @@ class MiniMaxTTSOptions(BaseModel): url: Optional[str] = Field(default=None, description="WebSocket endpoint (e.g., 'wss://api-uw.minimax.io/ws/v1/t2a_v2')") skip_patterns: 
Optional[List[int]] = Field(default=None) + @model_validator(mode="after") + def _validate_byok_params(self) -> "MiniMaxTTSOptions": + if self.key is not None: + missing = [ + name + for name, value in ( + ("group_id", self.group_id), + ("voice_id", self.voice_id), + ("url", self.url), + ) + if value is None + ] + if missing: + raise ValueError(f"MiniMaxTTS requires {', '.join(missing)} when key is set") + return self + class MiniMaxTTS(BaseTTS): def __init__(self, **kwargs: Any): self.options = MiniMaxTTSOptions(**kwargs) diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py index c169d76..bcde1bf 100644 --- a/tests/custom/test_llm_vendors.py +++ b/tests/custom/test_llm_vendors.py @@ -1,8 +1,8 @@ -from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM +from agora_agent import AmazonBedrock, Anthropic, AzureOpenAI, CustomLLM, Dify, Groq, VertexAILLM def test_groq_serializes_as_openai_compatible() -> None: - config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() + config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile", base_url="https://api.groq.com/openai/v1/chat/completions").to_config() assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" assert config["api_key"] == "groq-key" @@ -19,6 +19,37 @@ def test_custom_llm_marks_request_as_custom() -> None: assert config["style"] == "openai" +def test_anthropic_serializes_required_claude_fields() -> None: + config = Anthropic( + api_key="anthropic-key", + model="claude-3-5-sonnet-20241022", + url="https://api.anthropic.com/v1/messages", + headers={"anthropic-version": "2023-06-01"}, + max_tokens=1024, + ).to_config() + + assert config["url"] == "https://api.anthropic.com/v1/messages" + assert config["api_key"] == "anthropic-key" + assert config["style"] == "anthropic" + assert config["headers"]["anthropic-version"] == "2023-06-01" + assert config["params"]["model"] == "claude-3-5-sonnet-20241022" + assert 
config["params"]["max_tokens"] == 1024 + + +def test_azure_openai_includes_required_model_param() -> None: + config = AzureOpenAI( + api_key="azure-key", + endpoint="https://example.openai.azure.com", + deployment_name="deployment", + model="gpt-4o", + ).to_config() + + assert config["api_key"] == "azure-key" + assert config["vendor"] == "azure" + assert config["style"] == "openai" + assert config["params"]["model"] == "gpt-4o" + + def test_vertex_ai_llm_includes_project_routing() -> None: config = VertexAILLM( api_key="vertex-token", @@ -45,6 +76,7 @@ def test_amazon_bedrock_serializes_as_bedrock_style() -> None: assert config["access_key"] == "aws-access" assert config["secret_key"] == "aws-secret" assert config["region"] == "us-east-1" + assert config["url"] == "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-3-5-sonnet-20241022-v2:0/converse-stream" assert config["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" assert config["style"] == "bedrock" @@ -53,11 +85,13 @@ def test_dify_serializes_conversation_fields() -> None: config = Dify( api_key="dify-key", url="https://api.dify.ai/v1/chat-messages", + model="default", user="user-1", conversation_id="conversation-1", ).to_config() assert config["api_key"] == "dify-key" assert config["style"] == "dify" + assert config["params"]["model"] == "default" assert config["params"]["user"] == "user-1" assert config["params"]["conversation_id"] == "conversation-1" diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index e04693f..1ab6e80 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -20,8 +20,8 @@ def dump(value): def base_agent() -> Agent: return ( Agent() - .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini")) - .with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5")) + .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini", base_url="https://api.openai.com/v1/chat/completions")) + 
.with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5", base_url="wss://api.elevenlabs.io/v1")) ) diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index f87f4e9..ec3bbbf 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -1,4 +1,4 @@ -from agora_agent import AmazonTTS, CartesiaTTS, FishAudioTTS, GoogleTTS, MurfTTS, RimeTTS, SarvamTTS +from agora_agent import AmazonTTS, CartesiaTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS def test_tts_vendor_params_match_generated_core_shapes() -> None: @@ -35,6 +35,38 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: "backend": "speech-1.5", } + assert ElevenLabsTTS(key="eleven-key", model_id="eleven_flash_v2_5", voice_id="voice", base_url="wss://api.elevenlabs.io/v1").to_config()["params"] == { + "key": "eleven-key", + "base_url": "wss://api.elevenlabs.io/v1", + "model_id": "eleven_flash_v2_5", + "voice_id": "voice", + } + + assert OpenAITTS(api_key="openai-key", voice="coral", model="gpt-4o-mini-tts", base_url="https://api.openai.com/v1").to_config()["params"] == { + "voice": "coral", + "api_key": "openai-key", + "base_url": "https://api.openai.com/v1", + "model": "gpt-4o-mini-tts", + } + + assert OpenAITTS(voice="coral").to_config()["params"] == { + "voice": "coral", + } + + assert HumeAITTS(key="hume-key", voice_id="voice", provider="CUSTOM_VOICE").to_config()["params"] == { + "key": "hume-key", + "voice_id": "voice", + "provider": "CUSTOM_VOICE", + } + + assert MiniMaxTTS(key="minimax-key", group_id="group", model="speech-02-turbo", voice_id="voice", url="wss://api-uw.minimax.io/ws/v1/t2a_v2").to_config()["params"] == { + "model": "speech-02-turbo", + "key": "minimax-key", + "group_id": "group", + "voice_setting": {"voice_id": "voice"}, + "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", + } + assert SarvamTTS(key="sarvam-key", speaker="anushka", 
target_language_code="en-IN", sample_rate=24000).to_config()["params"] == { "api_subscription_key": "sarvam-key", "speaker": "anushka", @@ -42,10 +74,24 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: "sample_rate": 24000, } - assert MurfTTS(key="murf-key", voice_id="Ariana", base_url="wss://murf.example/ws").to_config()["params"] == { + assert MurfTTS( + key="murf-key", + voice_id="Ariana", + base_url="wss://murf.example/ws", + locale="en-US", + rate=0, + pitch=0, + model="FALCON", + sample_rate=24000, + ).to_config()["params"] == { "api_key": "murf-key", "base_url": "wss://murf.example/ws", "voiceId": "Ariana", + "locale": "en-US", + "rate": 0, + "pitch": 0, + "model": "FALCON", + "sample_rate": 24000, } assert MurfTTS(key="murf-key").to_config()["params"] == { From 477f40aa3df8b8d586a647685cf0e9686d7ce16f Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 01:43:39 +0000 Subject: [PATCH 09/26] [fern-generated] Update SDK Generated by Fern CLI Version: unknown Generators: - fernapi/fern-python-sdk: 4.37.0 --- reference.md | 21 ++- src/agora_agent/agents/client.py | 42 +++-- .../types/start_agents_request_properties.py | 12 +- .../start_agents_request_properties_asr.py | 47 ----- ...rt_agents_request_properties_asr_vendor.py | 10 - .../start_agents_request_properties_llm.py | 115 ------------ ...request_properties_llm_greeting_configs.py | 43 ----- ...st_properties_llm_greeting_configs_mode.py | 7 - ...request_properties_llm_mcp_servers_item.py | 54 ------ ...art_agents_request_properties_llm_style.py | 5 - .../start_agents_request_properties_mllm.py | 86 --------- ..._request_properties_mllm_turn_detection.py | 61 ------- ...es_mllm_turn_detection_agora_vad_config.py | 42 ----- ...est_properties_mllm_turn_detection_mode.py | 7 - ...mllm_turn_detection_semantic_vad_config.py | 32 ---- ...detection_semantic_vad_config_eagerness.py | 7 - ...s_mllm_turn_detection_server_vad_config.py | 
62 ------- ...t_agents_request_properties_mllm_vendor.py | 5 - src/agora_agent/core/client_wrapper.py | 4 +- src/agora_agent/types/amazon_asr.py | 27 +++ src/agora_agent/types/amazon_asr_params.py | 52 ++++++ src/agora_agent/types/amazon_tts_params.py | 16 +- .../types/amazon_tts_params_engine.py | 5 + src/agora_agent/types/ares_asr.py | 27 +++ src/agora_agent/types/ares_asr_params.py | 5 + src/agora_agent/types/asr.py | 172 ++++++++++++++++++ src/agora_agent/types/asr_language.py | 41 +++++ src/agora_agent/types/assembly_ai_asr.py | 27 +++ .../types/assembly_ai_asr_params.py | 37 ++++ .../types/cartesia_tts_output_format.py | 32 ++++ src/agora_agent/types/cartesia_tts_params.py | 17 +- src/agora_agent/types/deepgram_asr.py | 31 ++++ src/agora_agent/types/deepgram_asr_params.py | 47 +++++ .../types/eleven_labs_tts_params.py | 27 ++- .../types/fish_audio_tts_params.py | 7 +- src/agora_agent/types/google_asr.py | 27 +++ src/agora_agent/types/google_asr_params.py | 47 +++++ .../types/google_tts_audio_config.py | 32 ++++ src/agora_agent/types/google_tts_params.py | 28 ++- .../google_tts_voice_selection_params.py | 27 +++ src/agora_agent/types/hume_ai_tts_params.py | 28 ++- .../types/hume_ai_tts_params_provider.py | 5 + src/agora_agent/types/llm.py | 120 ++++++++++++ src/agora_agent/types/llm_params.py | 32 ++++ src/agora_agent/types/llm_style.py | 5 + src/agora_agent/types/microsoft_asr.py | 27 +++ src/agora_agent/types/microsoft_asr_params.py | 42 +++++ src/agora_agent/types/microsoft_tts_params.py | 10 + src/agora_agent/types/mllm.py | 88 +++++++++ src/agora_agent/types/mllm_http_options.py | 27 +++ .../types/mllm_input_audio_transcription.py | 37 ++++ src/agora_agent/types/mllm_params.py | 71 ++++++++ src/agora_agent/types/mllm_turn_detection.py | 35 ++++ .../mllm_turn_detection_agora_vad_config.py | 23 +++ .../types/mllm_turn_detection_mode.py | 5 + ...mllm_turn_detection_semantic_vad_config.py | 21 +++ ...detection_semantic_vad_config_eagerness.py | 5 + 
.../mllm_turn_detection_server_vad_config.py | 31 ++++ ...r_vad_config_end_of_speech_sensitivity.py} | 2 +- ...vad_config_start_of_speech_sensitivity.py} | 2 +- src/agora_agent/types/mllm_vendor.py | 5 + src/agora_agent/types/murf_tts_params.py | 39 +++- src/agora_agent/types/open_ai_asr.py | 27 +++ src/agora_agent/types/open_ai_asr_params.py | 30 +++ .../open_ai_input_audio_transcription.py | 37 ++++ src/agora_agent/types/open_ai_tts_params.py | 17 +- src/agora_agent/types/rime_tts_params.py | 13 +- src/agora_agent/types/sarvam_asr.py | 27 +++ src/agora_agent/types/sarvam_asr_params.py | 32 ++++ src/agora_agent/types/sarvam_tts_params.py | 25 ++- .../sarvam_tts_params_target_language_code.py | 8 + src/agora_agent/types/speechmatics_asr.py | 27 +++ .../types/speechmatics_asr_params.py | 37 ++++ tests/custom/test_avatar_token.py | 12 -- tests/custom/test_llm_vendors.py | 60 ------ tests/custom/test_root_exports.py | 29 --- 76 files changed, 1676 insertions(+), 758 deletions(-) delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_asr.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_style.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py delete mode 100644 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py create mode 100644 src/agora_agent/types/amazon_asr.py create mode 100644 src/agora_agent/types/amazon_asr_params.py create mode 100644 src/agora_agent/types/amazon_tts_params_engine.py create mode 100644 src/agora_agent/types/ares_asr.py create mode 100644 src/agora_agent/types/ares_asr_params.py create mode 100644 src/agora_agent/types/asr.py create mode 100644 src/agora_agent/types/asr_language.py create mode 100644 src/agora_agent/types/assembly_ai_asr.py create mode 100644 src/agora_agent/types/assembly_ai_asr_params.py create mode 100644 src/agora_agent/types/cartesia_tts_output_format.py create mode 100644 src/agora_agent/types/deepgram_asr.py create mode 100644 src/agora_agent/types/deepgram_asr_params.py create mode 100644 src/agora_agent/types/google_asr.py create mode 100644 src/agora_agent/types/google_asr_params.py create mode 100644 src/agora_agent/types/google_tts_audio_config.py create mode 100644 src/agora_agent/types/google_tts_voice_selection_params.py create mode 100644 src/agora_agent/types/hume_ai_tts_params_provider.py create mode 100644 src/agora_agent/types/llm.py create mode 100644 src/agora_agent/types/llm_params.py create mode 100644 src/agora_agent/types/llm_style.py create mode 100644 src/agora_agent/types/microsoft_asr.py create mode 100644 src/agora_agent/types/microsoft_asr_params.py create mode 100644 src/agora_agent/types/mllm.py create mode 100644 
src/agora_agent/types/mllm_http_options.py create mode 100644 src/agora_agent/types/mllm_input_audio_transcription.py create mode 100644 src/agora_agent/types/mllm_params.py create mode 100644 src/agora_agent/types/mllm_turn_detection.py create mode 100644 src/agora_agent/types/mllm_turn_detection_agora_vad_config.py create mode 100644 src/agora_agent/types/mllm_turn_detection_mode.py create mode 100644 src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py create mode 100644 src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py create mode 100644 src/agora_agent/types/mllm_turn_detection_server_vad_config.py rename src/agora_agent/{agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py => types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py} (61%) rename src/agora_agent/{agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py => types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py} (61%) create mode 100644 src/agora_agent/types/mllm_vendor.py create mode 100644 src/agora_agent/types/open_ai_asr.py create mode 100644 src/agora_agent/types/open_ai_asr_params.py create mode 100644 src/agora_agent/types/open_ai_input_audio_transcription.py create mode 100644 src/agora_agent/types/sarvam_asr.py create mode 100644 src/agora_agent/types/sarvam_asr_params.py create mode 100644 src/agora_agent/types/sarvam_tts_params_target_language_code.py create mode 100644 src/agora_agent/types/speechmatics_asr.py create mode 100644 src/agora_agent/types/speechmatics_asr_params.py delete mode 100644 tests/custom/test_avatar_token.py delete mode 100644 tests/custom/test_llm_vendors.py delete mode 100644 tests/custom/test_root_exports.py diff --git a/reference.md b/reference.md index 55a516e..57fc92a 100644 --- a/reference.md +++ b/reference.md @@ -27,11 +27,16 @@ Create and start a Conversational AI 
agent instance.
```python -from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft +from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, +) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -51,9 +56,7 @@ client.agents.start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -61,13 +64,15 @@ client.agents.start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/client.py b/src/agora_agent/agents/client.py index 3f6af4c..e923c9a 100644 --- a/src/agora_agent/agents/client.py +++ b/src/agora_agent/agents/client.py @@ -84,11 +84,16 @@ def start( Examples -------- - from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -108,9 +113,7 @@ def start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - 
asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -118,13 +121,15 @@ def start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", @@ -641,11 +646,16 @@ async def start( -------- import asyncio - from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Asr_Ares, + AsyncAgora, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -668,9 +678,7 @@ async def main() -> None: agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -678,13 +686,15 @@ async def main() -> None: voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py b/src/agora_agent/agents/types/start_agents_request_properties.py index 
06c3482..3cddb7e 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties.py +++ b/src/agora_agent/agents/types/start_agents_request_properties.py @@ -5,15 +5,15 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr import Asr +from ...types.llm import Llm +from ...types.mllm import Mllm from ...types.tts import Tts from .start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures -from .start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr from .start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from .start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords from .start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence from .start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption -from .start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm -from .start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm from .start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters from .start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc from .start_agents_request_properties_sal import StartAgentsRequestPropertiesSal @@ -67,7 +67,7 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Advanced features configuration. """ - asr: typing.Optional[StartAgentsRequestPropertiesAsr] = pydantic.Field(default=None) + asr: typing.Optional[Asr] = pydantic.Field(default=None) """ Automatic Speech Recognition (ASR) configuration. """ @@ -77,12 +77,12 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Text-to-speech (TTS) module configuration. 
""" - llm: typing.Optional[StartAgentsRequestPropertiesLlm] = pydantic.Field(default=None) + llm: typing.Optional[Llm] = pydantic.Field(default=None) """ Large language model (LLM) configuration. """ - mllm: typing.Optional[StartAgentsRequestPropertiesMllm] = pydantic.Field(default=None) + mllm: typing.Optional[Mllm] = pydantic.Field(default=None) """ Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr.py b/src/agora_agent/agents/types/start_agents_request_properties_asr.py deleted file mode 100644 index 7385e17..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr.py +++ /dev/null @@ -1,47 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor - - -class StartAgentsRequestPropertiesAsr(UncheckedBaseModel): - """ - Automatic Speech Recognition (ASR) configuration. - """ - - language: typing.Optional[str] = pydantic.Field(default=None) - """ - The BCP-47 language tag identifying the primary language used for agent interaction. If `params` contains a vendor-specific language code, it takes precedence over this setting. 
- """ - - vendor: typing.Optional[StartAgentsRequestPropertiesAsrVendor] = pydantic.Field(default=None) - """ - ASR provider: - - `ares`: Adaptive Recognition Engine for Speech - - `microsoft`: Microsoft Azure - - `deepgram`: Deepgram - - `openai`: OpenAI (Beta) - - `speechmatics`: Speechmatics - - `assemblyai`: AssemblyAI (Beta) - - `amazon`: Amazon Transcribe (Beta) - - `google`: Google (Beta) - - `sarvam`: Sarvam (Beta) - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - The configuration parameters for the ASR vendor. See [ASR Overview](https://docs.agora.io/en/conversational-ai/models/asr/overview) for details. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py deleted file mode 100644 index 973d62c..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py +++ /dev/null @@ -1,10 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesAsrVendor = typing.Union[ - typing.Literal[ - "ares", "microsoft", "deepgram", "openai", "google", "amazon", "assemblyai", "speechmatics", "sarvam" - ], - typing.Any, -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py deleted file mode 100644 index 9ab0f62..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ /dev/null @@ -1,115 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs -from .start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem -from .start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle - - -class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): - """ - Large language model (LLM) configuration. - """ - - url: str = pydantic.Field() - """ - The LLM callback address. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The LLM verification API key. The default value is an empty string. Ensure that you enable the API key in a production environment. - """ - - system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - A set of predefined information used as input to the LLM, including prompt words and examples. - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional LLM configuration parameters, such as the `model` used, and the maximum token limit. For details about each supported LLM, refer to [Supported LLMs](https://docs.agora.io/en/conversational-ai/models/llm/overview#supported-llms). - """ - - max_history: typing.Optional[int] = pydantic.Field(default=None) - """ - The number of conversation history messages cached in the custom LLM. History includes user and agent dialog messages, tool call information, and timestamps. Agent and user messages are recorded separately. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM input modalities: - - `["text"]`: Text only - - `["text", "image"]`: Text plus image. 
Recommended configuration, requires the selected LLM to support visual input - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM output modalities: - - `["text"]`: The output text is converted to speech by the TTS module and then published to the RTC channel. - - `["audio"]`: Voice only. Voice is published directly to the RTC channel. - - `["text", "audio"]`: Text plus voice. Write your own logic to process the output of LLM as needed. - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting. If provided, the first user in the channel is automatically greeted with the message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Prompt for agent activation failure. If provided, it is returned through TTS when the custom LLM call fails. - """ - - vendor: typing.Optional[str] = pydantic.Field(default=None) - """ - LLM provider, supports the following settings: - - `custom`: Custom LLM. When you set this option, the agent includes the following fields, in addition to `role` and `content` when making requests to the custom LLM: - - `turn_id`: A unique identifier for each conversation turn. It starts from `0` and increments with each turn. One user-agent interaction corresponds to one `turn_id`. - - `timestamp`: The request timestamp, in milliseconds. - - `azure`: Use this value for Azure OpenAI - """ - - style: typing.Optional[StartAgentsRequestPropertiesLlmStyle] = pydantic.Field(default=None) - """ - The request style for chat completion: - - `openai`: For OpenAI and OpenAI-compatible APIs - - `gemini`: For Google Gemini and Google Vertex API format - - `anthropic`: For Anthropic Claude API format - - `dify`: For Dify API format - """ - - greeting_configs: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigs] = pydantic.Field(default=None) - """ - Agent greeting broadcast configuration. 
- """ - - template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Template parameter configuration used to insert variables into the agent's `system_messages`, `greeting_message`, `failure_message`, and `parameters.silence_config.content` text. Uses key-value pairs, where the key is the variable name and the value is the variable's value. To insert defined variables in the prompt text, use the syntax `{{variable_name}}`. The system automatically replaces each variable with the corresponding value defined in `template_variables`. Variable values cannot reference other variables. - """ - - mcp_servers: typing.Optional[typing.List[StartAgentsRequestPropertiesLlmMcpServersItem]] = pydantic.Field( - default=None - ) - """ - MCP (Model Context Protocol) server configuration. By configuring MCP servers, agents can call tools provided by external services to implement advanced functionality. - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Custom headers to include in requests to the LLM. Use this field to pass business-specific information such as custom fields or tenant identifiers. These headers are merged with the headers generated by the Conversational AI Engine. If a key conflict occurs, the engine-generated header takes precedence. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py deleted file mode 100644 index c0d7046..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ /dev/null @@ -1,43 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs_mode import ( - StartAgentsRequestPropertiesLlmGreetingConfigsMode, -) - - -class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): - """ - Agent greeting broadcast configuration. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigsMode] = pydantic.Field(default=None) - """ - Determines when the agent sends greeting messages to users joining the channel. - - `single_every`: Broadcasts a greeting every time a user joins the channel. - - `single_first`: Broadcasts a greeting only once to the first user who joins the channel. - """ - - delay_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The delay in milliseconds before the agent plays the greeting message after a user joins the channel. - """ - - interruptable: typing.Optional[bool] = pydantic.Field(default=None) - """ - - `true`: Follows the global `interruption` configuration. - - `false`: Uninterruptible. The greeting plays in its entirety. 
If the user speaks multiple times while the greeting plays, the system merges the speech segments after the greeting ends and sends them to the LLM for a single response. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py deleted file mode 100644 index 44e4a55..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesLlmGreetingConfigsMode = typing.Union[ - typing.Literal["single_every", "single_first"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py deleted file mode 100644 index 0474072..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py +++ /dev/null @@ -1,54 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesLlmMcpServersItem(UncheckedBaseModel): - name: str = pydantic.Field() - """ - A unique identifier for the MCP server. Maximum 48 characters. Accepts only English letters and numbers. - """ - - endpoint: str = pydantic.Field() - """ - The endpoint address of the MCP server. The agent uses this to communicate with the MCP server. 
- """ - - transport: typing.Optional[typing.Literal["streamable_http"]] = pydantic.Field(default=None) - """ - Transport protocol type. - - `streamable_http`: Streaming HTTP protocol - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - HTTP header information to include when requesting the MCP server, such as authentication information. - """ - - allowed_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - A list of tools that the agent is allowed to invoke. The agent can only use tools on this list. - - Empty or omitted: All tools are enabled. - - Empty array `[]`: No tools are enabled. - - `["*"]`: All tools are enabled. - - Specific tools `["aa", "bb"]`: Only listed tools are enabled. - - Mix with wildcard `["aa", "*"]`: All tools are enabled (wildcard takes precedence). - """ - - timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The MCP server request timeout in milliseconds. After timeout, the agent stops waiting for the MCP server's response and continues executing subsequent logic. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py deleted file mode 100644 index eaa9a0d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesLlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py deleted file mode 100644 index 0993ebc..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection -from .start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor - - -class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - """ - Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. - """ - - enable: typing.Optional[bool] = pydantic.Field(default=None) - """ - Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. Replaces the deprecated `advanced_features.enable_mllm`. - """ - - url: typing.Optional[str] = pydantic.Field(default=None) - """ - The MLLM WebSocket URL for real-time communication. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The API key used for MLLM authentication. - """ - - messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - Array of conversation items used for short-term memory management. Uses the same structure as `item.content` from the OpenAI Realtime API. 
- """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional MLLM configuration parameters. The `modalities` setting is overridden by `input_modalities` and `output_modalities`. The `turn_detection` setting is overridden by `mllm.turn_detection`. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM input modalities: - - `["audio"]`: Audio only - - `["audio", "text"]`: Audio plus text - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM output modalities: - - `["text", "audio"]`: Text plus audio - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting message. If provided, the first user in the channel is automatically greeted with this message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent failure message. If provided, the agent speaks this message when an MLLM request fails. - """ - - vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) - """ - MLLM provider. Currently supports: - - `openai`: OpenAI Realtime API - - `gemini`: Google Gemini Live - - `vertexai`: Google Gemini Live (Vertex AI) - - `xai`: xAI Grok Realtime API - """ - - turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py deleted file mode 100644 index 032979d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_agora_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig, -) - - -class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionMode] = pydantic.Field(default=None) - """ - Turn detection mode for MLLM: - - `agora_vad`: Agora VAD-based detection. - - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API, Gemini Live, and xAI Grok. - - `semantic_vad`: Semantic-based detection. 
Supported by OpenAI Realtime API only. - """ - - agora_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. - """ - - server_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - semantic_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig] = ( - pydantic.Field(default=None) - ) - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py deleted file mode 100644 index ec30215..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py +++ /dev/null @@ -1,42 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. 
- """ - - interrupt_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Minimum duration of speech in milliseconds required to trigger an interruption. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. A higher value reduces false positives. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py deleted file mode 100644 index 0d004e8..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionMode = typing.Union[ - typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py deleted file mode 100644 index 1e310f0..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py +++ /dev/null @@ -1,32 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - eagerness: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness] = ( - pydantic.Field(default=None) - ) - """ - Controls how eagerly the model ends its turn. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py deleted file mode 100644 index 8b67b1d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness = typing.Union[ - typing.Literal["auto", "low", "medium", "high"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py deleted file mode 100644 index c74d8d7..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py +++ /dev/null @@ -1,62 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBaseModel): - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. Applicable to OpenAI Realtime API and xAI Grok. 
- """ - - idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Idle timeout in milliseconds. Applicable to OpenAI Realtime API only. - """ - - start_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for start of speech detection. Applicable to Gemini Live only. - """ - - end_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for end of speech detection. Applicable to Gemini Live only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py deleted file mode 100644 index 0233696..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index c44e886..acd9073 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.0.0", + "User-Agent": "agora-agents/v2.1.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.0.0", + "X-Fern-SDK-Version": "v2.1.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/amazon_asr.py b/src/agora_agent/types/amazon_asr.py new file mode 100644 index 0000000..4054518 --- /dev/null +++ b/src/agora_agent/types/amazon_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_asr_params import AmazonAsrParams +from .asr_language import AsrLanguage + + +class AmazonAsr(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_asr_params.py b/src/agora_agent/types/amazon_asr_params.py new file mode 100644 index 0000000..1d30688 --- /dev/null +++ b/src/agora_agent/types/amazon_asr_params.py @@ -0,0 +1,52 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AmazonAsrParams(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration parameters. + """ + + region: str = pydantic.Field() + """ + AWS region + """ + + access_key_id: str = pydantic.Field() + """ + AWS access key ID + """ + + secret_access_key: str = pydantic.Field() + """ + AWS secret access key + """ + + language_code: str = pydantic.Field() + """ + Language code for speech recognition + """ + + media_sample_rate_hz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hertz for the audio input + """ + + media_encoding: typing.Optional[str] = pydantic.Field(default=None) + """ + Encoding format of the audio input + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_tts_params.py b/src/agora_agent/types/amazon_tts_params.py index baaa6fa..bbecb36 100644 --- a/src/agora_agent/types/amazon_tts_params.py +++ b/src/agora_agent/types/amazon_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import 
IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_tts_params_engine import AmazonTtsParamsEngine class AmazonTtsParams(UncheckedBaseModel): @@ -12,26 +13,31 @@ class AmazonTtsParams(UncheckedBaseModel): Amazon Polly TTS configuration parameters. """ - access_key: str = pydantic.Field() + aws_access_key_id: str = pydantic.Field() """ - AWS access key + AWS access key ID """ - secret_key: str = pydantic.Field() + aws_secret_access_key: str = pydantic.Field() """ AWS secret key """ - region: str = pydantic.Field() + region_name: str = pydantic.Field() """ AWS region (e.g., "us-east-1") """ - voice_id: str = pydantic.Field() + voice: str = pydantic.Field() """ Amazon Polly voice ID """ + engine: AmazonTtsParamsEngine = pydantic.Field() + """ + Amazon Polly engine type + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/amazon_tts_params_engine.py b/src/agora_agent/types/amazon_tts_params_engine.py new file mode 100644 index 0000000..d9e3cfe --- /dev/null +++ b/src/agora_agent/types/amazon_tts_params_engine.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AmazonTtsParamsEngine = typing.Union[typing.Literal["standard", "neural", "long-form", "generative"], typing.Any] diff --git a/src/agora_agent/types/ares_asr.py b/src/agora_agent/types/ares_asr.py new file mode 100644 index 0000000..cf42216 --- /dev/null +++ b/src/agora_agent/types/ares_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage + + +class AresAsr(UncheckedBaseModel): + """ + Adaptive Recognition Engine for Speech ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/ares_asr_params.py b/src/agora_agent/types/ares_asr_params.py new file mode 100644 index 0000000..afa1d76 --- /dev/null +++ b/src/agora_agent/types/ares_asr_params.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AresAsrParams = typing.Dict[str, typing.Any] diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py new file mode 100644 index 0000000..f08086f --- /dev/null +++ b/src/agora_agent/types/asr.py @@ -0,0 +1,172 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +from __future__ import annotations + +import typing + +import pydantic +import typing_extensions +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel, UnionMetadata +from .amazon_asr_params import AmazonAsrParams +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams +from .deepgram_asr_params import DeepgramAsrParams +from .google_asr_params import GoogleAsrParams +from .microsoft_asr_params import MicrosoftAsrParams +from .open_ai_asr_params import OpenAiAsrParams +from .sarvam_asr_params import SarvamAsrParams +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class Asr_Ares(UncheckedBaseModel): + vendor: typing.Literal["ares"] = "ares" + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Microsoft(UncheckedBaseModel): + vendor: typing.Literal["microsoft"] = "microsoft" + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Deepgram(UncheckedBaseModel): + vendor: typing.Literal["deepgram"] = "deepgram" + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + 
smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Openai(UncheckedBaseModel): + vendor: typing.Literal["openai"] = "openai" + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Google(UncheckedBaseModel): + vendor: typing.Literal["google"] = "google" + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Amazon(UncheckedBaseModel): + vendor: typing.Literal["amazon"] = "amazon" + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Assemblyai(UncheckedBaseModel): + vendor: typing.Literal["assemblyai"] = "assemblyai" + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Speechmatics(UncheckedBaseModel): + vendor: typing.Literal["speechmatics"] = "speechmatics" + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = 
pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Sarvam(UncheckedBaseModel): + vendor: typing.Literal["sarvam"] = "sarvam" + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +Asr = typing_extensions.Annotated[ + typing.Union[ + Asr_Ares, + Asr_Microsoft, + Asr_Deepgram, + Asr_Openai, + Asr_Google, + Asr_Amazon, + Asr_Assemblyai, + Asr_Speechmatics, + Asr_Sarvam, + ], + UnionMetadata(discriminant="vendor"), +] diff --git a/src/agora_agent/types/asr_language.py b/src/agora_agent/types/asr_language.py new file mode 100644 index 0000000..4ff3c88 --- /dev/null +++ b/src/agora_agent/types/asr_language.py @@ -0,0 +1,41 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AsrLanguage = typing.Union[ + typing.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ], + typing.Any, +] diff --git a/src/agora_agent/types/assembly_ai_asr.py b/src/agora_agent/types/assembly_ai_asr.py new file mode 100644 index 0000000..ea2ebf4 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams + + +class AssemblyAiAsr(UncheckedBaseModel): + """ + AssemblyAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/assembly_ai_asr_params.py b/src/agora_agent/types/assembly_ai_asr_params.py new file mode 100644 index 0000000..f3a5818 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AssemblyAiAsrParams(UncheckedBaseModel): + """ + AssemblyAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + AssemblyAI API key + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for AssemblyAI's streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_output_format.py b/src/agora_agent/types/cartesia_tts_output_format.py new file mode 100644 index 0000000..ab7e122 --- /dev/null +++ b/src/agora_agent/types/cartesia_tts_output_format.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class CartesiaTtsOutputFormat(UncheckedBaseModel): + """ + Cartesia audio output format configuration. 
+ """ + + container: typing.Optional[str] = pydantic.Field(default=None) + """ + Audio container format for the output stream + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sampling rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_params.py b/src/agora_agent/types/cartesia_tts_params.py index 2aaf069..1478570 100644 --- a/src/agora_agent/types/cartesia_tts_params.py +++ b/src/agora_agent/types/cartesia_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .cartesia_tts_output_format import CartesiaTtsOutputFormat from .cartesia_tts_voice import CartesiaTtsVoice @@ -18,15 +19,21 @@ class CartesiaTtsParams(UncheckedBaseModel): Cartesia API key """ - voice: CartesiaTtsVoice - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: str = pydantic.Field() """ - Model ID (optional) + Model ID (for example, sonic-2) """ - sample_rate: typing.Optional[int] = pydantic.Field(default=None) + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Cartesia streaming API + """ + + voice: CartesiaTtsVoice + output_format: typing.Optional[CartesiaTtsOutputFormat] = None + language: typing.Optional[str] = pydantic.Field(default=None) """ - Audio sampling rate in Hz + Target language for speech synthesis """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py new file mode 100644 index 0000000..1c79c7b --- /dev/null +++ b/src/agora_agent/types/deepgram_asr.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .deepgram_asr_params import DeepgramAsrParams + + +class DeepgramAsr(UncheckedBaseModel): + """ + Deepgram ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands for preset-backed Deepgram usage. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py new file mode 100644 index 0000000..259958e --- /dev/null +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class DeepgramAsrParams(UncheckedBaseModel): + """ + Deepgram ASR configuration parameters. 
+ """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for Deepgram's streaming API + """ + + key: str = pydantic.Field() + """ + Deepgram API key + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Speech recognition model + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for speech recognition + """ + + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/eleven_labs_tts_params.py b/src/agora_agent/types/eleven_labs_tts_params.py index c6127fd..4a2bf8f 100644 --- a/src/agora_agent/types/eleven_labs_tts_params.py +++ b/src/agora_agent/types/eleven_labs_tts_params.py @@ -12,7 +12,7 @@ class ElevenLabsTtsParams(UncheckedBaseModel): ElevenLabs TTS configuration parameters. """ - base_url: typing.Optional[str] = pydantic.Field(default=None) + base_url: str = pydantic.Field() """ WebSocket URL (e.g., "wss://api.elevenlabs.io/v1") """ @@ -37,6 +37,31 @@ class ElevenLabsTtsParams(UncheckedBaseModel): Audio sample rate in Hz (16kHz for Akool, 24kHz for HeyGen) """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech speed multiplier. + """ + + stability: typing.Optional[float] = pydantic.Field(default=None) + """ + Voice stability. Higher values produce more consistent speech. + """ + + similarity_boost: typing.Optional[float] = pydantic.Field(default=None) + """ + Similarity boost for the selected voice. + """ + + style: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking style and expressiveness control. 
+ """ + + use_speaker_boost: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to improve voice quality and similarity. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/fish_audio_tts_params.py b/src/agora_agent/types/fish_audio_tts_params.py index 0ad77aa..9bb4ebb 100644 --- a/src/agora_agent/types/fish_audio_tts_params.py +++ b/src/agora_agent/types/fish_audio_tts_params.py @@ -12,7 +12,7 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Fish Audio API key """ @@ -22,6 +22,11 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio reference ID """ + backend: str = pydantic.Field() + """ + Backend model version to use + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/google_asr.py b/src/agora_agent/types/google_asr.py new file mode 100644 index 0000000..8473a04 --- /dev/null +++ b/src/agora_agent/types/google_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .google_asr_params import GoogleAsrParams + + +class GoogleAsr(UncheckedBaseModel): + """ + Google ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_asr_params.py b/src/agora_agent/types/google_asr_params.py new file mode 100644 index 0000000..9d17db6 --- /dev/null +++ b/src/agora_agent/types/google_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleAsrParams(UncheckedBaseModel): + """ + Google ASR configuration parameters. + """ + + project_id: str = pydantic.Field() + """ + Google Cloud project ID + """ + + location: str = pydantic.Field() + """ + Google Cloud region for the speech service + """ + + adc_credentials_string: str = pydantic.Field() + """ + Google Cloud service account credentials JSON string + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Recognition model to use + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_audio_config.py b/src/agora_agent/types/google_tts_audio_config.py new file mode 100644 index 0000000..9c2a405 --- /dev/null +++ b/src/agora_agent/types/google_tts_audio_config.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsAudioConfig(UncheckedBaseModel): + """ + Google audio output configuration. + """ + + speaking_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_params.py b/src/agora_agent/types/google_tts_params.py index dc00322..4a9ee38 100644 --- a/src/agora_agent/types/google_tts_params.py +++ b/src/agora_agent/types/google_tts_params.py @@ -3,8 +3,12 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel +from .google_tts_audio_config import GoogleTtsAudioConfig +from .google_tts_voice_selection_params import GoogleTtsVoiceSelectionParams class GoogleTtsParams(UncheckedBaseModel): @@ -12,25 +16,17 @@ class GoogleTtsParams(UncheckedBaseModel): Google TTS configuration parameters. 
""" - key: str = pydantic.Field() + credentials: str = pydantic.Field() """ - Google Cloud API key + Google Cloud service account credentials JSON string """ - voice_name: str = pydantic.Field() - """ - Google voice name - """ - - language_code: typing.Optional[str] = pydantic.Field(default=None) - """ - Language code (e.g., "en-US") - """ - - sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) - """ - Sample rate in Hz (default depends on selected voice) - """ + voice_selection_params: typing_extensions.Annotated[ + GoogleTtsVoiceSelectionParams, FieldMetadata(alias="VoiceSelectionParams") + ] + audio_config: typing_extensions.Annotated[ + typing.Optional[GoogleTtsAudioConfig], FieldMetadata(alias="AudioConfig") + ] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/google_tts_voice_selection_params.py b/src/agora_agent/types/google_tts_voice_selection_params.py new file mode 100644 index 0000000..ee75953 --- /dev/null +++ b/src/agora_agent/types/google_tts_voice_selection_params.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsVoiceSelectionParams(UncheckedBaseModel): + """ + Google voice selection parameters. 
+ """ + + name: str = pydantic.Field() + """ + Google voice name + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/hume_ai_tts_params.py b/src/agora_agent/types/hume_ai_tts_params.py index 08cb12b..00c9f54 100644 --- a/src/agora_agent/types/hume_ai_tts_params.py +++ b/src/agora_agent/types/hume_ai_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .hume_ai_tts_params_provider import HumeAiTtsParamsProvider class HumeAiTtsParams(UncheckedBaseModel): @@ -17,9 +18,34 @@ class HumeAiTtsParams(UncheckedBaseModel): Hume AI API key """ + voice_id: str = pydantic.Field() + """ + Hume AI voice ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Base URL for the Hume AI API + """ + + provider: HumeAiTtsParamsProvider = pydantic.Field() + """ + Voice provider type + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Playback speed of the generated speech + """ + + trailing_silence: typing.Optional[float] = pydantic.Field(default=None) + """ + Duration of silence in seconds to add at the end of each utterance + """ + config_id: typing.Optional[str] = pydantic.Field(default=None) """ - Hume AI configuration ID + Hume AI configuration ID. Deprecated; use voice_id for the documented TTS shape. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/hume_ai_tts_params_provider.py b/src/agora_agent/types/hume_ai_tts_params_provider.py new file mode 100644 index 0000000..cf07e73 --- /dev/null +++ b/src/agora_agent/types/hume_ai_tts_params_provider.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +HumeAiTtsParamsProvider = typing.Union[typing.Literal["HUME_AI", "CUSTOM_VOICE"], typing.Any] diff --git a/src/agora_agent/types/llm.py b/src/agora_agent/types/llm.py new file mode 100644 index 0000000..2b0283d --- /dev/null +++ b/src/agora_agent/types/llm.py @@ -0,0 +1,120 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .llm_params import LlmParams +from .llm_style import LlmStyle + + +class Llm(UncheckedBaseModel): + """ + Large language model (LLM) configuration. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM callback address. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM verification API key. + """ + + access_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS access key ID. Used by Amazon Bedrock when api_key is not provided. + """ + + secret_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS secret access key. Used by Amazon Bedrock when api_key is not provided. + """ + + region: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS region. Used by Amazon Bedrock. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Top-level model identifier. Used by Amazon Bedrock. + """ + + system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + A set of predefined information used as input to the LLM. + """ + + params: typing.Optional[LlmParams] = None + max_history: typing.Optional[int] = pydantic.Field(default=None) + """ + The number of conversation history messages cached in the custom LLM. + """ + + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM input modalities. 
+ """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Prompt for agent activation failure. + """ + + vendor: typing.Optional[str] = pydantic.Field(default=None) + """ + LLM provider identifier. + """ + + style: typing.Optional[LlmStyle] = pydantic.Field(default=None) + """ + The request style for chat completion. + """ + + ignore_empty: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to handle empty Gemini responses. + """ + + greeting_configs: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) + """ + Agent greeting broadcast configuration. + """ + + template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Template parameter configuration. + """ + + mcp_servers: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + MCP server configuration. + """ + + headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Custom headers to include in requests to the LLM. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_params.py b/src/agora_agent/types/llm_params.py new file mode 100644 index 0000000..f6df01f --- /dev/null +++ b/src/agora_agent/types/llm_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class LlmParams(UncheckedBaseModel): + """ + Additional LLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM model identifier. + """ + + max_tokens: typing.Optional[int] = pydantic.Field(default=None) + """ + Maximum tokens in the response. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_style.py b/src/agora_agent/types/llm_style.py new file mode 100644 index 0000000..8319ca1 --- /dev/null +++ b/src/agora_agent/types/llm_style.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +LlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify", "bedrock"], typing.Any] diff --git a/src/agora_agent/types/microsoft_asr.py b/src/agora_agent/types/microsoft_asr.py new file mode 100644 index 0000000..f602e09 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .microsoft_asr_params import MicrosoftAsrParams + + +class MicrosoftAsr(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_asr_params.py b/src/agora_agent/types/microsoft_asr_params.py new file mode 100644 index 0000000..bea79e4 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr_params.py @@ -0,0 +1,42 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MicrosoftAsrParams(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration parameters. + """ + + key: str = pydantic.Field() + """ + Microsoft Azure API key + """ + + region: str = pydantic.Field() + """ + Azure region + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + phrase_list: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + Words or phrases to improve recognition accuracy + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_tts_params.py b/src/agora_agent/types/microsoft_tts_params.py index 3c9e80c..12f441e 100644 --- a/src/agora_agent/types/microsoft_tts_params.py +++ b/src/agora_agent/types/microsoft_tts_params.py @@ -32,6 +32,16 @@ class MicrosoftTtsParams(UncheckedBaseModel): Audio sampling rate in Hz """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. Values between 0.5 and 2.0. 
+ """ + + volume: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio volume. Values between 0.0 and 100.0. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/mllm.py b/src/agora_agent/types/mllm.py new file mode 100644 index 0000000..3bcdb95 --- /dev/null +++ b/src/agora_agent/types/mllm.py @@ -0,0 +1,88 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_params import MllmParams +from .mllm_turn_detection import MllmTurnDetection +from .mllm_vendor import MllmVendor + + +class Mllm(UncheckedBaseModel): + """ + Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. + """ + + enable: typing.Optional[bool] = pydantic.Field(default=None) + """ + Enable Multimodal Large Language Model. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM WebSocket URL for real-time communication. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The API key used for MLLM authentication. + """ + + adc_credentials_string: typing.Optional[str] = pydantic.Field(default=None) + """ + Base64-encoded Google Cloud Application Default Credentials. Used by Vertex AI. + """ + + project_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud project ID. Used by Vertex AI. + """ + + location: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud location or region. Used by Vertex AI. + """ + + messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + Array of conversation items used for short-term memory management. 
+ """ + + params: typing.Optional[MllmParams] = None + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM input modalities. + """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting message. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent failure message. + """ + + vendor: typing.Optional[MllmVendor] = pydantic.Field(default=None) + """ + MLLM provider. + """ + + turn_detection: typing.Optional[MllmTurnDetection] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_http_options.py b/src/agora_agent/types/mllm_http_options.py new file mode 100644 index 0000000..19baebb --- /dev/null +++ b/src/agora_agent/types/mllm_http_options.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmHttpOptions(UncheckedBaseModel): + """ + HTTP request options for the MLLM provider. + """ + + api_version: typing.Optional[str] = pydantic.Field(default=None) + """ + API version to use. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_input_audio_transcription.py b/src/agora_agent/types/mllm_input_audio_transcription.py new file mode 100644 index 0000000..6bb3d9d --- /dev/null +++ b/src/agora_agent/types/mllm_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmInputAudioTranscription(UncheckedBaseModel): + """ + Configuration for audio input transcription. + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language of the input audio. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Model to use for transcription. + """ + + prompt: typing.Optional[str] = pydantic.Field(default=None) + """ + Text to guide the transcription model. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_params.py b/src/agora_agent/types/mllm_params.py new file mode 100644 index 0000000..5437b69 --- /dev/null +++ b/src/agora_agent/types/mllm_params.py @@ -0,0 +1,71 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_http_options import MllmHttpOptions +from .mllm_input_audio_transcription import MllmInputAudioTranscription + + +class MllmParams(UncheckedBaseModel): + """ + Additional MLLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM model identifier. + """ + + voice: typing.Optional[str] = pydantic.Field(default=None) + """ + Voice identifier for audio output. + """ + + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + System instructions that define the agent behavior or tone. + """ + + input_audio_transcription: typing.Optional[MllmInputAudioTranscription] = None + affective_dialog: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to enable Gemini affective dialog. + """ + + proactive_audio: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether Gemini may choose not to respond when no reply is needed. + """ + + transcribe_agent: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the agent speech in real time. + """ + + transcribe_user: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the user speech in real time. + """ + + http_options: typing.Optional[MllmHttpOptions] = None + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for xAI Grok speech recognition and synthesis. + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sample rate in Hz. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection.py b/src/agora_agent/types/mllm_turn_detection.py new file mode 100644 index 0000000..2cd3503 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection.py @@ -0,0 +1,35 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_agora_vad_config import MllmTurnDetectionAgoraVadConfig +from .mllm_turn_detection_mode import MllmTurnDetectionMode +from .mllm_turn_detection_semantic_vad_config import MllmTurnDetectionSemanticVadConfig +from .mllm_turn_detection_server_vad_config import MllmTurnDetectionServerVadConfig + + +class MllmTurnDetection(UncheckedBaseModel): + """ + Turn detection configuration for the MLLM module. + """ + + mode: typing.Optional[MllmTurnDetectionMode] = pydantic.Field(default=None) + """ + Turn detection mode for MLLM. 
+ """ + + agora_vad_config: typing.Optional[MllmTurnDetectionAgoraVadConfig] = None + server_vad_config: typing.Optional[MllmTurnDetectionServerVadConfig] = None + semantic_vad_config: typing.Optional[MllmTurnDetectionSemanticVadConfig] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py new file mode 100644 index 0000000..4168ef3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py @@ -0,0 +1,23 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): + interrupt_duration_ms: typing.Optional[int] = None + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_mode.py b/src/agora_agent/types/mllm_turn_detection_mode.py new file mode 100644 index 0000000..f6cd693 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_mode.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionMode = typing.Union[typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py new file mode 100644 index 0000000..aeaf440 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py @@ -0,0 +1,21 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_semantic_vad_config_eagerness import MllmTurnDetectionSemanticVadConfigEagerness + + +class MllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): + eagerness: typing.Optional[MllmTurnDetectionSemanticVadConfigEagerness] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py new file mode 100644 index 0000000..dbf9b4d --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionSemanticVadConfigEagerness = typing.Union[typing.Literal["auto", "low", "medium", "high"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_server_vad_config.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py new file mode 100644 index 0000000..b2976b3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, +) +from .mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, +) + + +class MllmTurnDetectionServerVadConfig(UncheckedBaseModel): + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + idle_timeout_ms: typing.Optional[int] = None + start_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity] = None + end_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py similarity index 61% rename from 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py index e92d3f1..b9b3377 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ typing.Literal["END_SENSITIVITY_HIGH", "END_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py similarity index 61% rename from src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py index 25860c1..90ccf51 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ typing.Literal["START_SENSITIVITY_HIGH", "START_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/types/mllm_vendor.py b/src/agora_agent/types/mllm_vendor.py new file mode 100644 index 0000000..61c4d1a --- /dev/null +++ 
b/src/agora_agent/types/mllm_vendor.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +MllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/types/murf_tts_params.py b/src/agora_agent/types/murf_tts_params.py index 5107f62..78f78d8 100644 --- a/src/agora_agent/types/murf_tts_params.py +++ b/src/agora_agent/types/murf_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,19 +14,46 @@ class MurfTtsParams(UncheckedBaseModel): Murf TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Murf API key """ - voice_id: str = pydantic.Field() + base_url: typing.Optional[str] = pydantic.Field(default=None) """ - Voice ID (e.g., Ariana, Natalie, Ken) + WebSocket endpoint for streaming TTS output """ - style: typing.Optional[str] = pydantic.Field(default=None) + voice_id: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="voiceId")] = pydantic.Field( + default=None + ) """ - Voice style (e.g., Angry, Sad, Conversational, Newscast) + Voice ID (e.g., Matthew) + """ + + locale: typing.Optional[str] = pydantic.Field(default=None) + """ + Locale for the selected voice + """ + + rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech rate adjustment + """ + + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + TTS model to use + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/open_ai_asr.py b/src/agora_agent/types/open_ai_asr.py new file mode 100644 index 
0000000..eec2aab --- /dev/null +++ b/src/agora_agent/types/open_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .open_ai_asr_params import OpenAiAsrParams + + +class OpenAiAsr(UncheckedBaseModel): + """ + OpenAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_asr_params.py b/src/agora_agent/types/open_ai_asr_params.py new file mode 100644 index 0000000..a5fadc8 --- /dev/null +++ b/src/agora_agent/types/open_ai_asr_params.py @@ -0,0 +1,30 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .open_ai_input_audio_transcription import OpenAiInputAudioTranscription + + +class OpenAiAsrParams(UncheckedBaseModel): + """ + OpenAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + OpenAI API key + """ + + input_audio_transcription: OpenAiInputAudioTranscription + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_input_audio_transcription.py b/src/agora_agent/types/open_ai_input_audio_transcription.py new file mode 100644 index 0000000..9db45b1 --- /dev/null +++ b/src/agora_agent/types/open_ai_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class OpenAiInputAudioTranscription(UncheckedBaseModel): + """ + OpenAI audio transcription configuration. + """ + + model: str = pydantic.Field() + """ + OpenAI ASR model to use for transcription + """ + + prompt: str = pydantic.Field() + """ + Prompt that guides the transcription process + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 3839646..c8f6e51 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,12 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key. Optional for Agora-managed OpenAI TTS usage. + OpenAI API key. Optional for preset-backed OpenAI TTS usage. 
+ """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Endpoint URL for the OpenAI TTS service. """ voice: str = pydantic.Field() @@ -27,6 +32,16 @@ class OpenAiTtsParams(UncheckedBaseModel): Model name (e.g., "tts-1", "tts-1-hd") """ + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + Custom instructions for voice style, accent, pace, and tone. + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/rime_tts_params.py b/src/agora_agent/types/rime_tts_params.py index 6d18375..ade1c5b 100644 --- a/src/agora_agent/types/rime_tts_params.py +++ b/src/agora_agent/types/rime_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,7 +14,7 @@ class RimeTtsParams(UncheckedBaseModel): Rime TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Rime API key """ @@ -22,9 +24,14 @@ class RimeTtsParams(UncheckedBaseModel): Rime speaker ID """ - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: typing_extensions.Annotated[str, FieldMetadata(alias="modelId")] = pydantic.Field() """ - Model ID (optional) + Rime TTS model ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Rime streaming API """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/sarvam_asr.py b/src/agora_agent/types/sarvam_asr.py new file mode 100644 index 0000000..ec95847 --- /dev/null +++ b/src/agora_agent/types/sarvam_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .sarvam_asr_params import SarvamAsrParams + + +class SarvamAsr(UncheckedBaseModel): + """ + Sarvam ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_asr_params.py b/src/agora_agent/types/sarvam_asr_params.py new file mode 100644 index 0000000..f29769d --- /dev/null +++ b/src/agora_agent/types/sarvam_asr_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SarvamAsrParams(UncheckedBaseModel): + """ + Sarvam ASR configuration parameters. + """ + + api_key: str = pydantic.Field() + """ + Sarvam API key + """ + + language: str = pydantic.Field() + """ + Language code for transcription. Set to unknown for automatic language detection. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_tts_params.py b/src/agora_agent/types/sarvam_tts_params.py index 93457a4..855299f 100644 --- a/src/agora_agent/types/sarvam_tts_params.py +++ b/src/agora_agent/types/sarvam_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .sarvam_tts_params_target_language_code import SarvamTtsParamsTargetLanguageCode class SarvamTtsParams(UncheckedBaseModel): @@ -12,7 +13,7 @@ class SarvamTtsParams(UncheckedBaseModel): Sarvam TTS configuration parameters. """ - key: str = pydantic.Field() + api_subscription_key: str = pydantic.Field() """ Sarvam API subscription key """ @@ -22,11 +23,31 @@ class SarvamTtsParams(UncheckedBaseModel): Voice ID (e.g., anushka, abhilash, karun, hitesh, manisha, vidya, arya) """ - target_language_code: str = pydantic.Field() + target_language_code: SarvamTtsParamsTargetLanguageCode = pydantic.Field() """ Target language code (e.g., en-IN) """ + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment for the voice + """ + + pace: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + loudness: typing.Optional[float] = pydantic.Field(default=None) + """ + Volume level of the speech + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/sarvam_tts_params_target_language_code.py b/src/agora_agent/types/sarvam_tts_params_target_language_code.py 
new file mode 100644 index 0000000..b1722ec --- /dev/null +++ b/src/agora_agent/types/sarvam_tts_params_target_language_code.py @@ -0,0 +1,8 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +SarvamTtsParamsTargetLanguageCode = typing.Union[ + typing.Literal["en-IN", "hi-IN", "bn-IN", "ta-IN", "te-IN", "kn-IN", "ml-IN", "mr-IN", "gu-IN", "pa-IN", "or-IN"], + typing.Any, +] diff --git a/src/agora_agent/types/speechmatics_asr.py b/src/agora_agent/types/speechmatics_asr.py new file mode 100644 index 0000000..644db25 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class SpeechmaticsAsr(UncheckedBaseModel): + """ + Speechmatics ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/speechmatics_asr_params.py b/src/agora_agent/types/speechmatics_asr_params.py new file mode 100644 index 0000000..4709d22 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SpeechmaticsAsrParams(UncheckedBaseModel): + """ + Speechmatics ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Speechmatics API key + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Speechmatics streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py deleted file mode 100644 index fa73fc0..0000000 --- a/tests/custom/test_avatar_token.py +++ /dev/null @@ -1,12 +0,0 @@ -from agora_agent.agentkit import generate_convo_ai_token - - -def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): - token = generate_convo_ai_token( - app_id="0" * 32, - app_certificate="1" * 32, - channel_name="room", - uid=123, - ) - - assert token.startswith("007") diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py deleted file mode 100644 index faca9bf..0000000 --- a/tests/custom/test_llm_vendors.py +++ /dev/null @@ -1,60 +0,0 @@ -from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM - - -def test_groq_serializes_as_openai_compatible() -> None: - config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() - - assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" - assert config["api_key"] == "groq-key" - assert config["style"] == "openai" - assert config["params"]["model"] == "llama-3.3-70b-versatile" - - -def test_custom_llm_marks_request_as_custom() -> None: - config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() - - assert config["url"] == "https://llm.example.com/chat" - assert config["api_key"] == "key" - assert config["vendor"] == "custom" - assert config["style"] == "openai" - 
- -def test_vertex_ai_llm_includes_project_routing() -> None: - config = VertexAILLM( - api_key="vertex-token", - model="gemini-2.0-flash", - project_id="project", - location="us-central1", - ).to_config() - - assert config["api_key"] == "vertex-token" - assert config["style"] == "gemini" - assert config["params"]["model"] == "gemini-2.0-flash" - assert config["params"]["project_id"] == "project" - assert config["params"]["location"] == "us-central1" - - -def test_amazon_bedrock_serializes_as_anthropic_style() -> None: - config = AmazonBedrock( - api_key="bedrock-key", - url="https://bedrock.example.com/messages", - model="anthropic.claude-3-5-sonnet-20241022-v2:0", - ).to_config() - - assert config["api_key"] == "bedrock-key" - assert config["style"] == "anthropic" - assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" - - -def test_dify_serializes_conversation_fields() -> None: - config = Dify( - api_key="dify-key", - url="https://api.dify.ai/v1/chat-messages", - user="user-1", - conversation_id="conversation-1", - ).to_config() - - assert config["api_key"] == "dify-key" - assert config["style"] == "dify" - assert config["params"]["user"] == "user-1" - assert config["params"]["conversation_id"] == "conversation-1" diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py deleted file mode 100644 index 9b2f508..0000000 --- a/tests/custom/test_root_exports.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest - -import agora_agent -import agora_agent.agentkit as agentkit - - -def test_root_exports_match_agentkit_for_common_symbols() -> None: - for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): - assert getattr(agora_agent, name) is getattr(agentkit, name) - - -def test_root_exports_fern_client_symbols() -> None: - assert agora_agent.Agora is not None - assert agora_agent.Area is not None - assert agora_agent.AsyncAgora is not None - - -def 
test_unknown_root_export_raises_attribute_error() -> None: - with pytest.raises(AttributeError): - _ = agora_agent.NotARealExportName - - -def test_dir_includes_agentkit_vendor_exports() -> None: - assert "DeepgramSTT" in dir(agora_agent) - - -def test_all_includes_agentkit_vendor_exports() -> None: - assert "DeepgramSTT" in agora_agent.__all__ - assert "OpenAI" in agora_agent.__all__ From 3f7ba38a18604633251538458e9cf7eca4a8b754 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 01:43:52 +0000 Subject: [PATCH 10/26] [fern-replay] Applied customizations Patches applied (5): - patch-64703bda: test(agentkit): add custom tests for v1.5.0 AgentKit behavior - patch-7c2d9d99: feat(agentkit): align session options and token uid handling - patch-7465fada: fix(agentkit): resolve Python session typing issues - patch-fae1249a: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. - patch-44c21c14: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
Patches with unresolved conflicts (17): - patch-6e30398b: chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases - patch-9df782b4: feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 - patch-26706d73: feat(agentkit): add GenericAvatar and session-aware avatar validation - patch-9f491c63: feat(agentkit): update Agent builder and session lifecycle for v2.7 - patch-6c20f076: docs(agentkit): update v1.5.0 guides, reference, and changelog - patch-eaec58eb: refactor(agentkit): align deprecated vendor aliases with canonical names - patch-20245632: feat(agentkit): export type aliases and avatar token helpers - patch-972dd5bd: updated docs - patch-4323b470: rename python package to agora-agents - patch-d29165c4: make python compat package publishable - patch-fc9d93c3: Document agora-agents PyPI install name and migration notes - patch-87fc4488: Update docs to import from agora_agent package root - patch-923cf954: Prioritize app credentials and builder in Python docs Rewrite getting-started auth and quick-start for app credentials with the builder API. De-emphasize presets and align index, BYOK, and README with the recommended onboarding path. - patch-d475306b: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. - patch-c9355576: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. - patch-98ecb4d3: Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM vendor helpers. Introduce named LLM vendor classes with correct request serialization, export them from the package root, and add tests covering each provider's config shape. 
- patch-a5097b8d: Document new LLM vendors and tighten onboarding docs. Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM to vendor references, simplify README and index navigation, and align quick-start and terminology with Agora-managed model language. Run `fern-replay resolve` to apply these customizations. Patches absorbed by generator (3): - patch-b7f0c36c: feat(agentkit): release v2.0.0 updates - patch-4d32368c: Add compat-build CI job and harden dual-package PyPI publish Build and verify the compat wheel re-exports, gate publish on compat-build, simplify version checks with poetry version, wait for primary package on PyPI, and retry compat publish on failure. - patch-20109390: Fix PyPI publish auth and explicitly protect release workflow in Fern ignore. Use PYPI_API_TOKEN for primary and compat Poetry publishes, matching the v1.4.1 release flow, and list release.yml explicitly in .fernignore. The generator now produces these customizations natively. --- .fern/replay.lock | 12088 ++++++++++++++++++- src/agora_agent/agentkit/agent.py | 2 + src/agora_agent/agentkit/agent_session.py | 1 + src/agora_agent/agentkit/vendors/avatar.py | 43 + src/agora_agent/agentkit/vendors/mllm.py | 1 + tests/custom/test_agentkit_agent.py | 298 + tests/custom/test_agentkit_session.py | 383 + tests/custom/test_agentkit_vendors.py | 122 + tests/custom/test_avatar_token.py | 12 + tests/custom/test_llm_vendors.py | 60 + tests/custom/test_root_exports.py | 29 + 11 files changed, 13037 insertions(+), 2 deletions(-) create mode 100644 tests/custom/test_agentkit_agent.py create mode 100644 tests/custom/test_agentkit_session.py create mode 100644 tests/custom/test_agentkit_vendors.py create mode 100644 tests/custom/test_avatar_token.py create mode 100644 tests/custom/test_llm_vendors.py create mode 100644 tests/custom/test_root_exports.py diff --git a/.fern/replay.lock b/.fern/replay.lock index 536b6e4..f044bf9 100644 --- a/.fern/replay.lock +++ b/.fern/replay.lock @@ -6,5 +6,12089 @@ 
generations: timestamp: 2026-05-20T20:38:02.180Z cli_version: unknown generator_versions: {} -current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf -patches: [] + - commit_sha: 477f40aa3df8b8d586a647685cf0e9686d7ce16f + tree_hash: 8dc76039ca5cfb9dc69697d34f5638c1e5e5002c + timestamp: 2026-06-02T01:43:39.293Z + cli_version: unknown + generator_versions: + fernapi/fern-python-sdk: 4.37.0 +current_generation: 477f40aa3df8b8d586a647685cf0e9686d7ce16f +patches: + - id: patch-6e30398b + content_hash: sha256:e99898e508e2d6cb9f134cc33e0b73c1c8acb845f5887924e0e38031a6e089c0 + original_commit: 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 + original_message: "chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/vendors/__init__.py + patch_content: | + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 1942bce..5ceda66 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -1,13 +1,30 @@ + from .agent import ( + Agent, + + AgentConfig, + + AgentConfigUpdate, + + ConversationHistory, + + ConversationRole, + + ConversationSessionTurn, + + ConversationTurn, + + ConversationTurns, + StartAgentsRequestProperties, + + AvatarConfig, + + AvatarVendor, + GeofenceConfig, + + LlmConfig, + + LlmStyle, + + MllmConfig, + + MllmVendor, + RtcConfig, + + SttConfig, + + SttVendor, + + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + @@ -37,9 +54,14 @@ from .agent import ( + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + 
McpServersItem, + + SessionInfo, + + SessionListResponse, + + SessionSummary, + + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + @@ -57,8 +79,10 @@ from ..agent_management.types.agent_think_agent_management_request_on_speaking_a + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -112,6 +136,7 @@ from .vendors import ( + FishAudioTTS, + Gemini, + GeminiLive, + + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + @@ -132,14 +157,27 @@ from .vendors import ( + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + + XaiGrok, + + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + + "AgentConfig", + + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + + "LlmConfig", + + "LlmStyle", + + "SttConfig", + + "SttVendor", + + "TtsConfig", + + "MllmConfig", + + "MllmVendor", + + "AvatarConfig", + + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + @@ -147,6 +185,7 @@ __all__ = [ + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + @@ -181,6 +220,7 @@ __all__ = [ + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + @@ -197,6 +237,15 @@ __all__ = [ + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + + "SessionInfo", + + "SessionListResponse", + + "SessionSummary", + + "ConversationHistory", + + "ConversationTurn", + + "ConversationRole", + + "ConversationTurns", + + "ConversationSessionTurn", + 
+ "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + @@ -253,14 +302,19 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + + "is_generic_avatar", + + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 0320843..689eab1 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -11,9 +11,9 @@ from .base import ( + OpenAISampleRate, + SampleRate, + ) + -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar + +from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + @@ -82,8 +82,11 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + ] + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + 
FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + 
GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + XaiGrok, + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + 
"EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", 
+ "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + 
"SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + status: unresolved + - id: patch-9df782b4 + content_hash: sha256:84c08fe3239d2ecb0b0a3ddd33b0dce4e7b012125be797aa83ca12893363b565 + original_commit: 9df782b46d872599f103078e30c5ded2053f2517 + original_message: "feat(agentkit): update MLLM and LLM vendor wrappers for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:57:54 -0400 + Subject: [PATCH] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 + + Adds xAI Grok Realtime and Vertex AI MLLM wrappers, and aligns MLLM + config serialization with the generated core types. LLM vendors now + accept typed greeting_configs and serialize them through the generated + model shape, including interruptable. 
+ --- + src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- + src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ + 2 files changed, 113 insertions(+), 36 deletions(-) + + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 7465c9f..6f74b43 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,9 +1,14 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] + result.append(item) + return result + + + + +def _dump_optional_model(value: Any) -> Any: + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -74,7 +87,7 @@ class OpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: 
Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -150,7 +163,7 @@ class AzureOpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -216,7 +229,7 @@ class Anthropic(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + 
greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -287,7 +300,7 @@ class Gemini(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 5f6f940..cd6cd07 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -53,18 +52,97 @@ class OpenAIRealtime(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if 
self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + + + +class XaiGrokOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="xAI API key") + + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + + + +class XaiGrok(BaseMLLM): + + """xAI 
Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + + + def __init__(self, **kwargs: Any): + + self.options = XaiGrokOptions(**kwargs) + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = dict(self.options.params or {}) + + if self.options.voice is not None: + + params["voice"] = self.options.voice + + if self.options.language is not None: + + params["language"] = self.options.language + + if self.options.sample_rate is not None: + + params["sample_rate"] = self.options.sample_rate + + + + config: Dict[str, Any] = { + + "vendor": "xai", + + "api_key": self.options.api_key, + + "url": self.options.url, + + "params": params, + + } + + + + if self.options.greeting_message is not None: + + config["greeting_message"] = self.options.greeting_message + + if self.options.input_modalities is not None: + + config["input_modalities"] = self.options.input_modalities + + if self.options.output_modalities is not None: + + config["output_modalities"] = self.options.output_modalities + + if self.options.messages is not None: + + config["messages"] = self.options.messages + + if self.options.failure_message is not None: + + config["failure_message"] = self.options.failure_message + + if self.options.turn_detection is not None: + + config["turn_detection"] = self.options.turn_detection + + + + return config + + + + + +class XaiRealtimeOptions(XaiGrokOptions): + + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + + + def __init__(self, **data: Any): + + warnings.warn( + + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**data) + + + + + +class XaiRealtime(XaiGrok): + + """Deprecated: use :class:`XaiGrok` instead.""" + + + + def __init__(self, **kwargs: Any): + + warnings.warn( + + "XaiRealtime is deprecated; use XaiGrok instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**kwargs) + + + + + class VertexAIOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + - params: Dict[str, Any] = { + - "model": self.options.model, + - "project_id": self.options.project_id, + - "location": self.options.location, + - "adc_credentials_string": self.options.adc_credentials_string, + - } + - + + # additional_params spread first so that explicit fields always win, + + # matching the TypeScript SDK. 
+ + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + params["model"] = self.options.model + + params["project_id"] = self.options.project_id + + params["location"] = self.options.location + + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + - if self.options.additional_params is not None: + - params.update(self.options.additional_params) + + config: Dict[str, Any] = { + "vendor": "vertexai", + @@ -119,12 +193,8 @@ class VertexAI(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -179,12 
+247,8 @@ class GeminiLive(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(default="gpt-4o-mini", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + endpoint: str = Field(..., 
description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + max_tokens: Optional[int] = Field(default=None, gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + 
failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or "https://api.anthropic.com/v1/messages", + "api_key": self.options.api_key, + "params": params, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = 
self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + src/agora_agent/agentkit/vendors/mllm.py: 
| + import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, 
description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + + class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: 
str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: 
Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + - id: patch-26706d73 + content_hash: sha256:a9551e0b774b96e7734e9faa7d770611861cf443837428272ef75710447238da + original_commit: 26706d73ae15d860d57daf926837632c01be7f10 + original_message: "feat(agentkit): add GenericAvatar and 
session-aware avatar validation" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/vendors/avatar.py + patch_content: |+ + From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:59:22 -0400 + Subject: [PATCH] feat(agentkit): add GenericAvatar and session-aware avatar + validation + + Adds the GenericAvatar vendor wrapper and extends avatar validation + helpers for generic and RTC-backed avatars. Session-derived fields such + as agora_appid, agora_channel, and agora_token can now be validated + after AgentSession enrichment. + --- + src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- + src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ + 2 files changed, 76 insertions(+), 1 deletion(-) + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index 9e132a9..a04809c 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + return config.get("vendor") == "generic" + + + + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + ) + + + + + +def validate_avatar_config( + + config: typing.Dict[str, typing.Any], + + require_session_fields: bool = False, + +) -> None: + """Validates avatar configuration at runtime. 
+ + Parameters + @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + + if require_session_fields and not params.get("agora_token"): + + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + + elif is_generic_avatar(config): + + params = config.get("params", {}) + + if not params.get("api_key"): + + raise ValueError("Generic avatar requires api_key") + + if not params.get("api_base_url"): + + raise ValueError("Generic avatar requires api_base_url") + + if not params.get("avatar_id"): + + raise ValueError("Generic avatar requires avatar_id") + + if not params.get("agora_uid"): + + raise ValueError("Generic avatar requires agora_uid") + + if require_session_fields: + + if not params.get("agora_token"): + + raise ValueError("Generic avatar requires agora_token after session enrichment") + + if not params.get("agora_appid"): + + raise ValueError("Generic avatar requires agora_appid after session enrichment") + + if not params.get("agora_channel"): + + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index b83a356..00cad8f 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -132,6 +132,48 @@ class LiveAvatarAvatar(BaseAvatar): + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + +class GenericAvatarOptions(BaseModel): + + 
model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Generic avatar provider API key") + + api_base_url: str = Field(..., description="Avatar provider API base URL") + + avatar_id: str = Field(..., description="Avatar ID") + + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + +class GenericAvatar(BaseAvatar): + + def __init__(self, **kwargs: Any): + + self.options = GenericAvatarOptions(**kwargs) + + + + @property + + def required_sample_rate(self) -> int: + + return 0 + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = { + + "api_key": self.options.api_key, + + "api_base_url": self.options.api_base_url, + + "avatar_id": self.options.avatar_id, + + "agora_uid": self.options.agora_uid, + + } + + + + if self.options.agora_appid is not None: + + params["agora_appid"] = self.options.agora_appid + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + + if self.options.agora_channel is not None: + + params["agora_channel"] = self.options.agora_channel + + if self.options.additional_params is not None: + + params = {**self.options.additional_params, **params} + + + + enable = self.options.enable if self.options.enable is not None else True + + return {"enable": enable, "vendor": "generic", "params": params} + + + + + class AnamAvatarOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/avatar_types.py: | + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. + """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. 
" + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. 
+ """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + AKOOL_SAMPLE_RATE = 16000 + + + class HeyGenAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="HeyGen API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, 
description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific 
parameters") + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + pass + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = 
Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + status: unresolved + - id: patch-9f491c63 + content_hash: sha256:d9811b2c5927be74f2125444dcf36642b88ad7be422019688cb0228093dce1d0 + original_commit: 9f491c63a964c13c67ba4af3708379e1b75a92d8 + original_message: "feat(agentkit): update Agent builder and session lifecycle for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + patch_content: |+ + From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 21:00:58 -0400 + Subject: [PATCH] feat(agentkit): update Agent builder and session lifecycle + for v2.7 + + Aligns Agent and AgentSession with the generated v2.7 request shape. + MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars + are rejected when MLLM is configured. AgentSession now enriches generic + and RTC avatars with session context, auto-generates avatar tokens, + validates TTS sample rates from vendor-specific fields, and adds + paginated get_turns/get_all_turns helpers with fail-fast pagination + guards. 
+ --- + src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- + src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- + 2 files changed, 360 insertions(+), 35 deletions(-) + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 70a1bdd..86a958e 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -8,6 +8,24 @@ if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + +from ..agents.types.get_agents_response import GetAgentsResponse + +from ..agents.types.list_agents_response import ListAgentsResponse + +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + +from 
..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger import + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + +LlmConfig = StartAgentsRequestPropertiesLlm + +LlmStyle = StartAgentsRequestPropertiesLlmStyle + +SttConfig = StartAgentsRequestPropertiesAsr + +SttVendor = StartAgentsRequestPropertiesAsrVendor + +TtsConfig = Tts + +MllmConfig = StartAgentsRequestPropertiesMllm + +MllmVendor = 
StartAgentsRequestPropertiesMllmVendor + +AvatarConfig = StartAgentsRequestPropertiesAvatar + +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + @@ -93,6 +122,18 @@ InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + +AgentConfig = StartAgentsRequestProperties + +AgentConfigUpdate = UpdateAgentsRequestProperties + +SessionInfo = GetAgentsResponse + +SessionListResponse = ListAgentsResponse + +SessionSummary = ListAgentsResponseDataListItem + +ConversationHistory = GetHistoryAgentsResponse + +ConversationTurn = GetHistoryAgentsResponseContentsItem + +ConversationRole = GetHistoryAgentsResponseContentsItemRole + +ConversationTurns = GetTurnsAgentsResponse + +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + +SpeakPriority = SpeakAgentsRequestPriority + +Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + @@ -116,6 +157,7 @@ FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + @@ -183,9 +225,20 @@ class Agent: + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + + sample_rate = vendor.sample_rate + + if ( + + self._avatar_required_sample_rate not in (None, 0) + + and sample_rate is not None + 
+ and sample_rate != self._avatar_required_sample_rate + + ): + + raise ValueError( + + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + + f"but TTS is configured with {sample_rate} Hz. " + + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + - new_agent._tts_sample_rate = vendor.sample_rate + + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + @@ -194,6 +247,9 @@ class Agent: + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + + # Note: avatars are not supported with MLLM. The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` so callers can still + + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + @@ -202,7 +258,10 @@ class Agent: + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + + advanced_features_model = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_mllm": None}, + + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + @@ -214,6 +273,10 @@ class Agent: + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + + # Note: avatars are not supported with MLLM. 
The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + + # enabled) so callers may still combine the two for testing or for the + + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + @@ -282,7 +345,10 @@ class Agent: + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + + new_agent._advanced_features = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_tools": enabled}, + + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + @@ -294,6 +360,23 @@ class Agent: + new_agent._parameters = parameters + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + + new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. 
+ + @@ -342,6 +425,33 @@ class Agent: + new_agent._filler_words = filler_words + return new_agent + + + @staticmethod + + def _field_value(value: typing.Any, field: str) -> typing.Any: + + if value is None: + + return None + + if isinstance(value, dict): + + return value.get(field) + + return getattr(value, field, None) + + + + @staticmethod + + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + + if hasattr(value, "model_copy"): + + return value.model_copy(update=update) + + if hasattr(value, "copy"): + + return value.copy(update=update) + + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + + data_channel = self._field_value(self._parameters, "data_channel") + + if not enable_rtm or data_channel is not None: + + return self._parameters + + if self._parameters is None: + + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + + if isinstance(self._parameters, dict): + + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + + @property + def name(self) -> typing.Optional[str]: + return self._name + @@ -354,6 +464,10 @@ class Agent: + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + + @property + + def tts_sample_rate(self) -> typing.Optional[int]: + + return self._tts_sample_rate + + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + @@ -536,6 +650,20 @@ class Agent: + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + + # Validate the MLLM + enabled-avatar combination BEFORE generating the + + # RTC token so callers 
get a clear, actionable error first (matches the + + # TypeScript and Go SDKs' fail-fast contract). + + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + + avatar_enabled = ( + + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + + ) + + if is_mllm_mode and avatar_enabled: + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + @@ -553,9 +681,6 @@ class Agent: + **token_kwargs, + ) + + - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + - is_mllm_mode = bool(mllm_flag or self._mllm is not None) + - + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + @@ -579,11 +704,12 @@ class Agent: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + - if self._parameters is not None: + - if isinstance(self._parameters, dict): + - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + + parameters = self._resolved_parameters() + + if parameters is not None: + + if isinstance(parameters, dict): + + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + - base_kwargs["parameters"] = self._parameters + + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + @@ -596,12 +722,10 @@ class Agent: + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + - if self._greeting: + + if self._greeting is not None: + 
mllm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + - if self._max_history is not None: + - mllm_config.setdefault("max_history", self._max_history) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + @@ -617,14 +741,14 @@ class Agent: + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + - if self._instructions: + + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + - if self._greeting: + - llm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + - llm_config.setdefault("failure_message", self._failure_message) + + if self._greeting is not None: + + llm_config["greeting_message"] = self._greeting + + if self._failure_message is not None: + + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + - llm_config.setdefault("max_history", self._max_history) + + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index 2408659..e41a399 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_request_on_thinking_a + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent 
import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -182,17 +185,29 @@ class _AgentSessionBase: + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + + if self._is_mllm_mode(): + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + + sample_rate = self._agent.tts_sample_rate + + if sample_rate is None and isinstance(tts_params, dict): + + sample_rate = ( + + tts_params.get("sample_rate") + + or tts_params.get("sample_rate_hertz") + + or tts_params.get("samplingRate") + + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + @@ -211,6 +226,50 @@ class _AgentSessionBase: + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + + avatar = properties.get("avatar") + + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + + return + + + + params = avatar.get("params") + + if not isinstance(params, dict): + + params = {} + + avatar["params"] = params + + + + if is_generic_avatar(avatar): + + if not params.get("agora_appid"): + + params["agora_appid"] = self._app_id + + if not params.get("agora_channel"): + + params["agora_channel"] = self._channel + + + + if not is_rtc_avatar(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_token"): + + if not self._app_certificate: + + raise ValueError( + + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + + ) + + token_kwargs: typing.Dict[str, typing.Any] = {} + + if self._expires_in is not None: + + token_kwargs["token_expire"] = self._expires_in + + params["agora_token"] = generate_convo_ai_token( + + app_id=self._app_id, + + app_certificate=self._app_certificate, + + channel_name=self._channel, + + account=str(params["agora_uid"]), + + **token_kwargs, + + ) + + + + if str(params.get("agora_uid")) == self._agent_uid: + + self._warn( + + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ + ) + + + + validate_avatar_config(avatar, require_session_fields=True) + + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + @@ -238,12 +297,17 @@ class _AgentSessionBase: + **token_opts, + ) + properties = self._dump_model(base_properties) + + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + - mllm = dict(self._agent.mllm) + - if self._agent.greeting: + + mllm = self._dump_model(self._agent.mllm) + + if not isinstance(mllm, dict): + + mllm = {} + + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + + if self._agent.failure_message is not None: + + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + @@ -251,20 +315,41 @@ class _AgentSessionBase: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + - if self._agent.instructions: + + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + - if self._agent.greeting: + - llm.setdefault("greeting_message", self._agent.greeting) + - if self._agent.failure_message: + - llm.setdefault("failure_message", self._agent.failure_message) + + if self._agent.greeting is not None: + + llm["greeting_message"] = self._agent.greeting + + if self._agent.failure_message is not None: + + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + - llm.setdefault("max_history", self._agent.max_history) + + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + + @staticmethod + + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + + if pagination is None: + + return None + + if 
isinstance(pagination, dict): + + return pagination.get(field) + + return getattr(pagination, field, None) + + + + @staticmethod + + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + + return list(turns or []) + + + + @classmethod + + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + + data = cls._dump_model(first_response) + + if not isinstance(data, dict): + + data = {} + + data["turns"] = turns + + return GetTurnsAgentsResponse(**data) + + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + @@ -484,7 +569,12 @@ class AgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -547,15 +637,68 @@ class AgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - def get_turns(self) -> typing.Any: + + def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." + + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + @@ -734,7 +877,12 @@ class AsyncAgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -797,11 +945,64 @@ class AsyncAgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - async def get_turns(self) -> typing.Any: + + async def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = await self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = await self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from 
..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from 
..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, 
BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and 
.end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + 
silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and 
sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + 
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + 
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history 
is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + status: unresolved + - id: patch-eaec58eb + content_hash: sha256:8390ced175326080fc76021a97d315e71229bbc9ad70eef35a63eb9968df7830 + original_commit: eaec58eb2edfe03b1311a32dd137a867edf5d096 + original_message: "refactor(agentkit): align deprecated vendor aliases with canonical names" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/__init__.py + - src/agora_agent/agentkit/vendors/avatar.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 689eab1..8e2042e 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -13,7 +13,7 @@ from .base import ( + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + @@ -83,7 +83,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index 00cad8f..50bdd08 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -5,19 +5,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + -HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + -class HeyGenAvatarOptions(BaseModel): + +class 
LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + - api_key: str = Field(..., description="HeyGen API key") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + @@ -31,20 +31,14 @@ class HeyGenAvatarOptions(BaseModel): + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + -class HeyGenAvatar(BaseAvatar): + - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - warnings.warn( + - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - self.options = HeyGenAvatarOptions(**kwargs) + + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return HEYGEN_SAMPLE_RATE + + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + @@ -65,71 +59,79 @@ class HeyGenAvatar(BaseAvatar): + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "heygen", "params": params} + + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + -class AkoolAvatarOptions(BaseModel): + - model_config = ConfigDict(extra="forbid") + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + - api_key: str = Field(..., description="Akool API key") + - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + -class AkoolAvatar(BaseAvatar): + +class HeyGenAvatar(BaseAvatar): + + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + + def __init__(self, **kwargs: Any): + - self.options = AkoolAvatarOptions(**kwargs) + + warnings.warn( + + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return AKOOL_SAMPLE_RATE + + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + + "quality": self.options.quality, + + "agora_uid": self.options.agora_uid, + } + + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + + if self.options.disable_idle_timeout is not None: + + params["disable_idle_timeout"] = self.options.disable_idle_timeout + + if self.options.activity_idle_timeout is not None: + + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "akool", "params": params} + + return {"enable": enable, "vendor": "heygen", "params": params} + + + -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + - pass + +class AkoolAvatarOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Akool API key") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + -class LiveAvatarAvatar(BaseAvatar): + +class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - 
self.options = LiveAvatarAvatarOptions(**kwargs) + + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return LIVEAVATAR_SAMPLE_RATE + + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + - "quality": self.options.quality, + - "agora_uid": self.options.agora_uid, + } + + - if self.options.agora_token is not None: + - params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + - if self.options.disable_idle_timeout is not None: + - params["disable_idle_timeout"] = self.options.disable_idle_timeout + - if self.options.activity_idle_timeout is not None: + - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "liveavatar", "params": params} + + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + - persona_id: Optional[str] = Field(default=None, description="Persona ID") + + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index cd6cd07..b58f040 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,4 +1,3 @@ + -import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -119,30 +118,6 @@ class XaiGrok(BaseMLLM): + return config + + + -class XaiRealtimeOptions(XaiGrokOptions): + - """Deprecated: use :class:`XaiGrokOptions` instead.""" + - + - def __init__(self, **data: Any): + - warnings.warn( + - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**data) + - + - + -class XaiRealtime(XaiGrok): + - """Deprecated: use :class:`XaiGrok` instead.""" + - + - def __init__(self, **kwargs: Any): + - warnings.warn( + - "XaiRealtime is deprecated; use XaiGrok instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**kwargs) + - + - + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + theirs_snapshot: + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + 
MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + LIVEAVATAR_SAMPLE_RATE = 24000 + HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + class LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] 
= Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = 
self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = 
{**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + src/agora_agent/agentkit/vendors/mllm.py: | + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: 
Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). 
Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class 
VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + 
- id: patch-20245632 + content_hash: sha256:a22e4a3b114ba8105c8129ccd6222570dc1f231daf9ac6037a00bcd4e11c425b + original_commit: 20245632afd066efe5a453665b29c5ba0e13e4f8 + original_message: "feat(agentkit): export type aliases and avatar token helpers" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/constants.py + patch_content: |+ + From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 15:17:27 -0400 + Subject: [PATCH] feat(agentkit): export type aliases and avatar token helpers + + --- + src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- + src/agora_agent/agentkit/agent.py | 22 +++++++++- + src/agora_agent/agentkit/agent_session.py | 8 +++- + src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- + src/agora_agent/agentkit/constants.py | 10 +++++ + 5 files changed, 90 insertions(+), 22 deletions(-) + + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 5ceda66..e9ab221 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -2,6 +2,7 @@ from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + @@ -62,23 +63,23 @@ from .agent import ( + SessionListResponse, + SessionSummary, + SpeakPriority, + + ThinkOnListeningAction, + + ThinkOnSpeakingAction, + + ThinkOnThinkingAction, + + ThinkResponse, + ) + -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + -from ..agent_management.types.agent_think_agent_management_response import ( + - AgentThinkAgentManagementResponse as AgentThinkResponse, + -) + -from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + +# Deprecated think type aliases (prefer ThinkOn* names). + +from .agent import ( + + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + + ThinkResponse as AgentThinkResponse, + ) + +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + @@ -94,6 +95,13 @@ from .constants import ( + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + + ThinkOnListeningActionIgnore, + + ThinkOnListeningActionInject, + + ThinkOnListeningActionInterrupt, + + ThinkOnSpeakingActionIgnore, + + ThinkOnSpeakingActionInterrupt, + + ThinkOnThinkingActionIgnore, + + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + @@ -158,7 +166,6 @@ from .vendors import ( + SpeechmaticsSTT, + VertexAI, + XaiGrok, + - XaiRealtime, + LiveAvatarAvatar, + ) + + @@ -172,6 +179,7 @@ __all__ = [ + "LlmConfig", + "LlmStyle", + "SttConfig", + + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + @@ -230,6 +238,13 @@ __all__ = [ + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + + "ThinkOnListeningActionInject", + + 
"ThinkOnListeningActionInterrupt", + + "ThinkOnListeningActionIgnore", + + "ThinkOnThinkingActionInterrupt", + + "ThinkOnThinkingActionIgnore", + + "ThinkOnSpeakingActionInterrupt", + + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + @@ -246,10 +261,16 @@ __all__ = [ + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + + "ThinkResponse", + + "ThinkOnListeningAction", + + "ThinkOnThinkingAction", + + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + + "is_avatar_token_managed", + + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + @@ -303,7 +324,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + @@ -314,7 +334,6 @@ __all__ = [ + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + - "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 86a958e..14933a2 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + -from .token import generate_convo_ai_token, _validate_expires_in + +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + + AgentThinkAgentManagementRequestOnListeningAction, + +) + 
+from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + + AgentThinkAgentManagementRequestOnThinkingAction, + +) + +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + + AgentThinkAgentManagementRequestOnSpeakingAction, + +) + +from ..agent_management.types.agent_think_agent_management_response import ( + + AgentThinkAgentManagementResponse, + +) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + +AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + @@ -159,6 +171,14 @@ FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + +# Think type aliases and response + +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + +ThinkResponse = AgentThinkAgentManagementResponse + + + +from .token import generate_convo_ai_token, _validate_expires_in + + + + class Agent: + """A reusable agent definition. 
+ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e41a399..269619e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -20,10 +20,10 @@ from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + - is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -242,7 +242,11 @@ class _AgentSessionBase: + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + - if not is_rtc_avatar(avatar): + + if not is_avatar_token_managed(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index a04809c..aea9da1 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -1,3 +1,4 @@ + +import warnings + import typing + + + @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + + """Return True when AgentKit manages the avatar RTC publisher identity.""" + + return ( + + is_heygen_avatar(config) + + or is_live_avatar_avatar(config) + + or is_generic_avatar(config) + + ) + + + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + - params = config.get("params", {}) + - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + """Deprecated: use :func:`is_avatar_token_managed` for 
vendor gating.""" + + warnings.warn( + + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + + "and keep agora_uid checks in session enrichment.", + + DeprecationWarning, + + stacklevel=2, + ) + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + @@ -95,7 +110,7 @@ def validate_tts_sample_rate( + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - - HeyGen: ONLY supports 24,000 Hz + + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py + index f86e4d3..c0a852e 100644 + --- a/src/agora_agent/agentkit/constants.py + +++ b/src/agora_agent/agentkit/constants.py + @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + + + +# Think action value constants (match Fern wire values) + +ThinkOnListeningActionInject = "inject" + +ThinkOnListeningActionInterrupt = "interrupt" + +ThinkOnListeningActionIgnore = "ignore" + +ThinkOnThinkingActionInterrupt = "interrupt" + +ThinkOnThinkingActionIgnore = "ignore" + +ThinkOnSpeakingActionInterrupt = "interrupt" + +ThinkOnSpeakingActionIgnore = "ignore" + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + 
FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, + ) + # Deprecated think type aliases (prefer ThinkOn* names). 
+ from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + 
VertexAI, + XaiGrok, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + 
"AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, 
AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from 
..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import 
StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from 
..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from 
..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + 
ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _validate_expires_in + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. 
+ + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features 
+ self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
def to_properties(
    self,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    token: typing.Optional[str] = None,
    app_id: typing.Optional[str] = None,
    app_certificate: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    skip_vendor_validation: bool = False,
) -> StartAgentsRequestProperties:
    """Assemble the ``StartAgentsRequestProperties`` payload for this agent.

    Parameters
    ----------
    channel, agent_uid, remote_uids
        Copied into ``channel``, ``agent_rtc_uid`` and ``remote_rtc_uids``.
    idle_timeout, enable_string_uid
        Copied into the payload only when not ``None``.
    token
        Token to place in the payload. When ``None`` one is generated with
        ``generate_convo_ai_token`` from ``app_id`` and ``app_certificate``.
    expires_in
        When given, validated with ``_validate_expires_in`` and forwarded to
        the token generator as ``token_expire``.
    skip_vendor_validation
        When true (and the agent is not in MLLM mode) the payload is returned
        before the TTS/LLM checks and without ``llm``/``tts``/``asr`` keys.

    Raises
    ------
    ValueError
        If an MLLM configuration is combined with an enabled avatar, if
        neither ``token`` nor ``app_id`` + ``app_certificate`` is supplied,
        or if TTS / LLM configuration is missing in cascading mode.
    """
    # Reject MLLM + enabled avatar before any token is generated so the
    # caller sees the configuration error first.
    uses_mllm = self._mllm is not None
    avatar_active = (
        isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
    )
    if uses_mllm and avatar_active:
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    if token is None:
        if app_id is None or app_certificate is None:
            raise ValueError("Either token or app_id+app_certificate must be provided")
        # generate_convo_ai_token covers RTC + RTM, so the token is usable
        # whether or not advanced_features.enable_rtm is switched on.
        extra_token_args: typing.Dict[str, typing.Any] = {}
        if expires_in is not None:
            lifetime = _validate_expires_in(expires_in)
            if lifetime is not None:
                extra_token_args["token_expire"] = lifetime
        token = generate_convo_ai_token(
            app_id=app_id,
            app_certificate=app_certificate,
            channel_name=channel,
            account=agent_uid,
            **extra_token_args,
        )

    props: typing.Dict[str, typing.Any] = {
        "channel": channel,
        "token": token,
        "agent_rtc_uid": agent_uid,
        "remote_rtc_uids": remote_uids,
    }

    # Optional settings are copied only when set; key order is kept stable.
    leading_optionals = (
        ("idle_timeout", idle_timeout),
        ("enable_string_uid", enable_string_uid),
        ("mllm", self._mllm),
        ("turn_detection", self._turn_detection),
        ("interruption", self._interruption),
        ("sal", self._sal),
        ("avatar", self._avatar),
        ("advanced_features", self._advanced_features),
    )
    for key, value in leading_optionals:
        if value is not None:
            props[key] = value

    resolved = self._resolved_parameters()
    if isinstance(resolved, dict):
        props["parameters"] = StartAgentsRequestPropertiesParameters(**resolved)
    elif resolved is not None:
        props["parameters"] = resolved

    trailing_optionals = (
        ("geofence", self._geofence),
        ("labels", self._labels),
        ("rtc", self._rtc),
        ("filler_words", self._filler_words),
    )
    for key, value in trailing_optionals:
        if value is not None:
            props[key] = value

    if uses_mllm:
        # Agent-level greeting / failure message only fill gaps in the MLLM
        # configuration; values already present there win.
        merged_mllm = dict(self._mllm)
        for key, fallback in (
            ("greeting_message", self._greeting),
            ("failure_message", self._failure_message),
        ):
            if fallback is not None:
                merged_mllm.setdefault(key, fallback)
        props["mllm"] = merged_mllm
        return StartAgentsRequestProperties(**props)

    if skip_vendor_validation:
        return StartAgentsRequestProperties(**props)

    if self._tts is None:
        raise ValueError("TTS configuration is required. Use with_tts() to set it.")

    if self._llm is None:
        raise ValueError("LLM configuration is required. Use with_llm() to set it.")

    # In cascading mode the agent-level fields override the vendor config.
    llm_payload = dict(self._llm)
    if self._instructions is not None:
        llm_payload["system_messages"] = [{"role": "system", "content": self._instructions}]
    for key, override in (
        ("greeting_message", self._greeting),
        ("failure_message", self._failure_message),
        ("max_history", self._max_history),
    ):
        if override is not None:
            llm_payload[key] = override

    props["llm"] = llm_payload
    props["tts"] = self._tts
    if self._stt is not None:
        props["asr"] = self._stt

    return StartAgentsRequestProperties(**props)
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + 
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." 
+ ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." + ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return 
properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
class AgentSession(_AgentSessionBase):
    """Synchronous driver for the lifecycle of one agent session.

    Wraps the blocking client calls for starting, stopping and interacting
    with a running agent. Create instances through
    :meth:`Agent.create_session` rather than calling the constructor.

    Examples
    --------
    >>> from agora_agent import Agora, Area
    >>> from agora_agent.agentkit import Agent
    >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS
    >>>
    >>> client = Agora(area=Area.US, app_id="...", app_certificate="...")
    >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.")
    >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="..."))
    >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"])
    >>> agent_id = session.start()
    >>> session.say("Hello!")
    >>> session.stop()
    """

    # ------------------------------------------------------------------
    # Private guards and state transitions
    # ------------------------------------------------------------------

    def _ensure_agent_id(self) -> None:
        """Raise ``RuntimeError`` when no agent ID has been recorded."""
        if not self._agent_id:
            raise RuntimeError("No agent ID available")

    def _ensure_running(self, action: str) -> None:
        """Raise ``RuntimeError`` unless the session is running with an agent ID."""
        if self._status != "running":
            raise RuntimeError(f"Cannot {action} in {self._status} state")
        self._ensure_agent_id()

    def _mark_stopped(self) -> None:
        """Record the ``stopped`` status and notify ``stopped`` handlers."""
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})

    def _mark_failed(self, error: BaseException) -> None:
        """Record the ``error`` status and notify ``error`` handlers."""
        self._status = "error"
        self._emit("error", error)

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def start(self) -> str:
        """Start the agent session and return the agent ID.

        Returns
        -------
        str
            The agent ID from the start response, or ``""`` when the
            response carries none.

        Raises
        ------
        RuntimeError
            If the status is not ``idle``, ``stopped`` or ``error``.
        ValueError
            If avatar/TTS configuration is invalid.
        """
        startable_states = ("idle", "stopped", "error")
        if self._status not in startable_states:
            raise RuntimeError(f"Cannot start session in {self._status} state")

        self._validate_avatar_config()
        self._status = "starting"

        try:
            # An explicit token is used as-is; otherwise pass the credentials
            # from which a token is generated.
            token_opts: typing.Dict[str, typing.Any] = (
                {"token": self._token}
                if self._token
                else {
                    "app_id": self._app_id,
                    "app_certificate": self._app_certificate,
                    "expires_in": self._expires_in,
                }
            )

            start_properties = self._build_start_properties(token_opts)
            preset_name, final_properties = resolve_session_presets(
                self._preset,
                start_properties,
            )

            if self._debug:
                print("[Agora Debug] Starting agent session...")
                print("[Agora Debug] Request:", {
                    "appid": self._app_id,
                    "name": self._name,
                    "preset": preset_name,
                    "pipeline_id": self._pipeline_id,
                    "properties": final_properties,
                })

            # Prefer the typed request model; if it cannot be built from the
            # resolved properties, send the plain dict instead.
            payload: typing.Any
            try:
                payload = StartAgentsRequestProperties(**final_properties)
            except Exception:
                payload = final_properties

            response = self._client.agents.start(
                self._app_id,
                name=self._name,
                properties=payload,
                preset=preset_name,
                pipeline_id=self._pipeline_id,
                request_options=self._request_options(),
            )

            self._agent_id = getattr(response, "agent_id", None)
            self._status = "running"
            self._emit("started", {"agent_id": self._agent_id})
            return self._agent_id or ""
        except Exception as exc:
            self._mark_failed(exc)
            raise

    def stop(self) -> None:
        """Stop the agent session.

        An ``ApiError`` with status code 404 is treated as a successful stop
        (the agent is already gone); any other failure marks the session as
        errored and is re-raised.
        """
        self._ensure_running("stop session")
        self._status = "stopping"

        try:
            self._client.agents.stop(
                self._app_id, self._agent_id, request_options=self._request_options()
            )
            self._mark_stopped()
        except ApiError as exc:
            if exc.status_code == 404:
                self._mark_stopped()
                return
            self._mark_failed(exc)
            raise
        except Exception as exc:
            self._mark_failed(exc)
            raise

    # ------------------------------------------------------------------
    # Interaction
    # ------------------------------------------------------------------

    def say(
        self,
        text: str,
        priority: typing.Optional[str] = None,
        interruptable: typing.Optional[bool] = None,
    ) -> None:
        """Ask the agent to speak ``text``.

        Parameters
        ----------
        text : str
            The text to speak.
        priority : str, optional
            Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``).
        interruptable : bool, optional
            Whether the message can be interrupted by the user.
        """
        self._ensure_running("say")

        optional_fields = (("priority", priority), ("interruptable", interruptable))
        speak_args: typing.Dict[str, typing.Any] = {"text": text}
        speak_args.update({key: value for key, value in optional_fields if value is not None})

        self._client.agents.speak(
            self._app_id, self._agent_id, request_options=self._request_options(), **speak_args
        )

    def interrupt(self) -> None:
        """Interrupt the agent while it is speaking or thinking."""
        self._ensure_running("interrupt")

        self._client.agents.interrupt(
            self._app_id, self._agent_id, request_options=self._request_options()
        )

    def think(
        self,
        text: str,
        *,
        on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None,
        on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None,
        on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None,
        interruptable: typing.Optional[bool] = None,
        metadata: typing.Optional[typing.Dict[str, str]] = None,
    ) -> AgentThinkResponse:
        """Inject a custom text instruction into the current session pipeline.

        In API v2.7, omitting ``on_listening_action`` uses the server default
        ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to
        preserve the pre-v2.7 behavior. Arguments left as ``None`` are not
        sent.
        """
        self._ensure_running("think")

        optional_fields = (
            ("on_listening_action", on_listening_action),
            ("on_thinking_action", on_thinking_action),
            ("on_speaking_action", on_speaking_action),
            ("interruptable", interruptable),
            ("metadata", metadata),
        )
        think_args: typing.Dict[str, typing.Any] = {"text": text}
        think_args.update({key: value for key, value in optional_fields if value is not None})

        return self._client.agent_management.agent_think(
            self._app_id,
            self._agent_id,
            request_options=self._request_options(),
            **think_args,
        )

    def update(self, properties: typing.Any) -> None:
        """Update the agent configuration at runtime.

        Parameters
        ----------
        properties : UpdateAgentsRequestProperties
            Partial configuration to update.
        """
        self._ensure_running("update")

        self._client.agents.update(
            self._app_id,
            self._agent_id,
            properties=properties,
            request_options=self._request_options(),
        )

    # ------------------------------------------------------------------
    # Queries (allowed in any status once an agent ID is recorded)
    # ------------------------------------------------------------------

    def get_history(self) -> typing.Any:
        """Get the conversation history."""
        self._ensure_agent_id()

        return self._client.agents.get_history(
            self._app_id, self._agent_id, request_options=self._request_options()
        )

    def get_info(self) -> typing.Any:
        """Get the current session info."""
        self._ensure_agent_id()

        return self._client.agents.get(
            self._app_id, self._agent_id, request_options=self._request_options()
        )

    def get_turns(
        self,
        *,
        page_index: typing.Optional[int] = None,
        page_size: typing.Optional[int] = None,
    ) -> GetTurnsAgentsResponse:
        """Get turn-by-turn analytics and timing details for this session."""
        self._ensure_agent_id()

        return self._client.agents.get_turns(
            self._app_id,
            self._agent_id,
            page_index=page_index,
            page_size=page_size,
            request_options=self._request_options(),
        )

    def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse:
        """Get all turn analytics pages for this session.

        Pages are requested from index 1 until the pagination metadata
        reports the last page or ``total_pages`` is reached. The returned
        response is rebuilt from the final page fetched, with the turns of
        every page combined.

        Raises ``RuntimeError`` if the server's pagination metadata is missing
        the fields required to advance, or if requesting the next page returns
        a page index that did not advance.
        """
        cannot_continue = (
            "get_all_turns pagination cannot continue: response must include "
            "page_index, total_pages, or is_last_page=true."
        )
        read = self._page_value

        def pagination_of(page: typing.Any) -> typing.Any:
            return page.get("pagination") if isinstance(page, dict) else page.pagination

        response = self.get_turns(page_index=1, page_size=page_size)
        collected = self._response_turns(response)
        pagination = pagination_of(response)
        current_page = read(pagination, "page_index") or 1

        while pagination is not None and read(pagination, "is_last_page") is False:
            total_pages = read(pagination, "total_pages")
            if read(pagination, "page_index") is None and total_pages is None:
                raise RuntimeError(cannot_continue)
            if total_pages is not None and current_page >= total_pages:
                break

            next_page = current_page + 1
            response = self.get_turns(page_index=next_page, page_size=page_size)
            collected.extend(self._response_turns(response))
            pagination = pagination_of(response)

            returned_index = read(pagination, "page_index") if pagination else None
            if returned_index is None:
                # No index came back: only continue when the metadata still
                # offers another way to detect the end.
                total_pages = read(pagination, "total_pages") if pagination else None
                is_last_page = read(pagination, "is_last_page") if pagination else None
                if total_pages is None and is_last_page is not True:
                    raise RuntimeError(cannot_continue)
                current_page = next_page
                continue

            if returned_index <= current_page and read(pagination, "is_last_page") is not True:
                raise RuntimeError(
                    f"get_all_turns pagination did not advance: requested page {next_page}, "
                    f"received page {returned_index}."
                )
            current_page = returned_index

        return self._with_all_turns(response, collected)
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + src/agora_agent/agentkit/avatar_types.py: | + import warnings + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. 
+ """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + 
avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. + """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/constants.py: | + """ + Type-safe constants for agent configuration values. + Use these instead of raw strings to avoid typos and get IDE autocomplete. 
+ """ + + # Data channel: "rtm" | "datastream" + class DataChannel: + RTM = "rtm" + DATASTREAM = "datastream" + + class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + + + # Silence action when timeout elapses: "speak" | "think" + # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) + class SilenceActionValues: + SPEAK = "speak" + THINK = "think" + + + # SAL mode: "locking" | "recognition" + # (Use for sal.sal_mode — avoids shadowing SalMode type) + class SalModeValues: + LOCKING = "locking" + RECOGNITION = "recognition" + + + # Geofence area: "GLOBAL" | "NORTH_AMERICA" | "EUROPE" | "ASIA" | "INDIA" | "JAPAN" + class GeofenceArea: + GLOBAL = "GLOBAL" + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Geofence exclude area (when area is GLOBAL) + class GeofenceExcludeArea: + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Filler word selection rule: "shuffle" | "round_robin" + class FillerWordsSelectionRule: + SHUFFLE = "shuffle" + ROUND_ROBIN = "round_robin" + + + # Turn detection type (deprecated; use TurnDetectionNestedConfig.EndOfSpeech instead) + class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + # Think action value constants (match Fern wire values) + ThinkOnListeningActionInject = "inject" + ThinkOnListeningActionInterrupt = "interrupt" + ThinkOnListeningActionIgnore = "ignore" + ThinkOnThinkingActionInterrupt = "interrupt" + ThinkOnThinkingActionIgnore = "ignore" + ThinkOnSpeakingActionInterrupt = "interrupt" + ThinkOnSpeakingActionIgnore = "ignore" + status: unresolved + - id: patch-972dd5bd + content_hash: sha256:10f86db20e0b5a3800efce4913b736ff338dee29eb18cb31e89658e0293b848e + original_commit: 972dd5bdafc09b3981ab2ce4e0d02beae165c626 + original_message: updated docs + original_author: 
digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 16:13:35 -0400 + Subject: [PATCH] updated docs + + --- + docs/reference/agent.md | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 1e88b8b..3163f9c 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -264,3 +264,18 @@ to_properties( + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + + +## Type aliases + + + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + + + +## Cross-SDK discovery map + + + +| Concept | Python | TypeScript | Go | + +|---|---|---|---| + +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | + | `failure_message` | `Optional[str]` | `None` | Spoken on error | + | `max_history` | `Optional[int]` | `None` | Max conversation history length | + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | 
`filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent.agentkit.vendors import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent.agentkit.vendors import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent.agentkit.vendors import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent.agentkit.vendors import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. 
+ + + ```python + from agora_agent.agentkit.vendors import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Override the system prompt. + + ### `with_greeting(greeting: str) -> Agent` + + Override the greeting message. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. 
+ + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Set the message spoken via TTS when the LLM call fails. + + ### `with_max_history(max_history: int) -> Agent` + + Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | System prompt | + | `greeting` | `Optional[str]` | Greeting message | + | `failure_message` | `Optional[str]` | Message spoken when LLM fails | + | `max_history` | `Optional[int]` | Max conversation history length | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ + ## Cross-SDK discovery map + + | Concept | Python | TypeScript | Go | + |---|---|---|---| + | STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + | xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + | Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + | Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + status: unresolved + - id: patch-7465fada + content_hash: sha256:9c6ed2e5f48702293eed8b213cc31cce63a7ed5a1ad16a0b23e791c13e77746f + original_commit: 7465fadafa0f1e62051d99b42d0eeda85f31eeee + original_message: "fix(agentkit): resolve Python session typing issues" + original_author: digitallysavvy + base_generation: 477f40aa3df8b8d586a647685cf0e9686d7ce16f + files: + - src/agora_agent/agentkit/agent_session.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action 
import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if 
self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + - id: patch-d29165c4 + content_hash: sha256:be59d1d3efc435d5e0b83305b2cd39ce3dad4534a4125de18028c137e692e659 + original_commit: d29165c4ddd8296af703a4e9ed848516f563dd1b + original_message: make python compat package publishable + original_author: chenyuguo + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/pyproject.toml + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 + From: chenyuguo + Date: Wed, 27 May 2026 17:24:50 +0800 + Subject: [PATCH] make python compat package publishable + + --- + compat/agora-agent-server-sdk/README.md | 2 ++ + compat/agora-agent-server-sdk/pyproject.toml | 3 +++ + .../src/agora_agent_server_sdk_compat/__init__.py | 1 + + 3 files changed, 6 insertions(+) + create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index 1388836..cff3cfe 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -9,3 +9,5 @@ pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index 8efbe53..ac93128 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -26,6 +26,9 @@ classifiers = [ + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + +packages = [ + + { include = "agora_agent_server_sdk_compat", from = "src"} + +] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + new file mode 100644 + index 0000000..55522c6 + --- /dev/null + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -0,0 +1 @@ + +"""Compatibility package for the renamed agora-agents distribution.""" + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.0.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.0.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility package for the renamed agora-agents distribution.""" + status: unresolved + - id: patch-fae1249a + content_hash: sha256:01bf21f3cc4c784dfcff80a48c9c7bb3123af4327a567b7c990b528e9780e9a2 + original_commit: fae1249a20c53761a2eb5515a1bf92ca666760d1 + original_message: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
+ original_author: digitallysavvy + base_generation: 477f40aa3df8b8d586a647685cf0e9686d7ce16f + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 27 May 2026 16:58:18 -0400 + Subject: [PATCH] Re-export agora-agents API from legacy PyPI compatibility + package The compat distribution delegates to agora_agent via __getattr__ and + documents both import paths in its README. + + --- + compat/agora-agent-server-sdk/README.md | 7 +++++-- + .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- + 2 files changed, 18 insertions(+), 3 deletions(-) + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index cff3cfe..e43d1d8 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -8,6 +8,9 @@ New projects should install: + pip install agora-agents + ``` + + -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ +```python + +from agora_agent import Agora, Area + +from agora_agent_server_sdk_compat import Agora, Area + +``` + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + index 55522c6..6283244 100644 + --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -1 +1,13 @@ + -"""Compatibility package for the renamed agora-agents distribution.""" + +"""Compatibility re-exports for the renamed agora-agents package.""" + + + +import agora_agent as _agora_agent + + + +__all__ = getattr(_agora_agent, "__all__", []) + + + + + +def __getattr__(name: str): + + return getattr(_agora_agent, name) + + + + + +def __dir__(): + + return dir(_agora_agent) + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. 
The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility re-exports for the renamed agora-agents package.""" + + import agora_agent as _agora_agent + + __all__ = getattr(_agora_agent, "__all__", []) + + + def __getattr__(name: str): + return getattr(_agora_agent, name) + + + def __dir__(): + return dir(_agora_agent) + user_owned: true + - id: patch-fc9d93c3 + content_hash: sha256:93877741bdad745fda5dd549d7c3dd6bc315f4574aabd2defb52c0c795bff011 + original_commit: fc9d93c3026a6109d8a5e8b386418592f8d121c5 + original_message: Document agora-agents PyPI install name and migration notes + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/installation.md + patch_content: | + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index c14bdb2..f6f1750 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. + ## Install with pip + + ```sh + -pip install agora-agent-sdk + +pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + -poetry add agora-agent-sdk + +poetry add agora-agents + ``` + + ## Dependencies + theirs_snapshot: + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. 
+ --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Dependencies + + The following packages are installed automatically: + + | Package | Purpose | + |---|---| + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Sync vs. Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. + status: unresolved + - id: patch-44c21c14 + content_hash: sha256:34f08060a06ca824943ab02e75c3c83ad43a1b6e7d09ec6f8fa244ef82de6fcd + original_commit: 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b + original_message: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
+ original_author: digitallysavvy + base_generation: 477f40aa3df8b8d586a647685cf0e9686d7ce16f + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_root_exports.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index f84862c..0d7a4aa 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from 
..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection 
import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from 
..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from 
..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from 
..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + 
EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + 
on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + 
+ def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: 
typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + 
def create_session(
    self,
    client: typing.Any,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    name: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> "AgentSession":
    """Create a synchronous :class:`AgentSession` bound to this agent.

    The session name is the first truthy value among ``name``, the agent's
    own name, and a generated ``agent-<unix seconds>`` fallback. ``app_id``
    and ``app_certificate`` are read from ``client`` when it exposes them,
    otherwise ``""`` and ``None`` are used. All remaining arguments are
    forwarded to the session unchanged.
    """
    # Imported lazily, inside the method, as in the original implementation.
    from .agent_session import AgentSession

    resolved_name = name or self._name
    if not resolved_name:
        resolved_name = f"agent-{int(time.time())}"

    session_options: typing.Dict[str, typing.Any] = {
        "client": client,
        "agent": self,
        "app_id": getattr(client, "app_id", ""),
        "app_certificate": getattr(client, "app_certificate", None),
        "name": resolved_name,
        "channel": channel,
        "token": token,
        "agent_uid": agent_uid,
        "remote_uids": remote_uids,
        "idle_timeout": idle_timeout,
        "enable_string_uid": enable_string_uid,
        "preset": preset,
        "pipeline_id": pipeline_id,
        "expires_in": expires_in,
        "debug": debug,
        "warn": warn,
    }
    return AgentSession(**session_options)
def to_properties(
    self,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    token: typing.Optional[str] = None,
    app_id: typing.Optional[str] = None,
    app_certificate: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    skip_vendor_validation: bool = False,
) -> StartAgentsRequestProperties:
    """Build the ``StartAgentsRequestProperties`` payload for this agent.

    Parameters
    ----------
    channel, agent_uid, remote_uids
        Written to ``channel``, ``agent_rtc_uid`` and ``remote_rtc_uids``.
    idle_timeout, enable_string_uid
        Included in the payload only when not ``None``.
    token
        Token to send. When ``None`` a token is generated with
        ``generate_convo_ai_token`` from ``app_id`` and ``app_certificate``.
    app_id, app_certificate
        Required only when ``token`` is ``None``.
    expires_in
        When given and a token is generated, it is passed through
        ``_validate_expires_in`` and forwarded as ``token_expire``.
    skip_vendor_validation
        When ``True`` (cascading mode only) the payload is returned without
        the ``llm``, ``tts`` and ``asr`` entries and without checking that
        LLM/TTS are configured.

    Returns
    -------
    StartAgentsRequestProperties
        In MLLM mode the ``mllm`` entry is a copy of the MLLM config in
        which the agent-level greeting/failure message only fill keys that
        are missing. In cascading mode the agent-level instructions,
        greeting, greeting configs, failure message and max history
        overwrite the corresponding LLM config keys.

    Raises
    ------
    ValueError
        If an MLLM config is combined with an enabled avatar, if neither a
        token nor both ``app_id`` and ``app_certificate`` are supplied, or
        if TTS/LLM configuration is missing in cascading mode while
        ``skip_vendor_validation`` is ``False``. The token helpers called
        here may raise as well (not visible from this method).
    """
    # Validate the MLLM + enabled-avatar combination BEFORE generating the
    # RTC token so callers get a clear, actionable error first (matches the
    # TypeScript and Go SDKs' fail-fast contract).
    # NOTE(review): mllm_flag can only be True when _mllm is a dict (hence not
    # None), so is_mllm_mode reduces to "self._mllm is not None"; an MLLM
    # config with enable=False still counts as MLLM mode. Confirm intended.
    mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True
    is_mllm_mode = bool(mllm_flag or self._mllm is not None)
    # An avatar dict counts as enabled unless it says "enable": False explicitly.
    avatar_enabled = (
        isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
    )
    if is_mllm_mode and avatar_enabled:
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    if token is None:
        if app_id is None or app_certificate is None:
            raise ValueError("Either token or app_id+app_certificate must be provided")
        validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None
        # Use generate_convo_ai_token (RTC + RTM) so the token works whether or
        # not the caller enables advanced_features.enable_rtm.
        token_kwargs: typing.Dict[str, typing.Any] = {}
        if validated_expires_in is not None:
            token_kwargs["token_expire"] = validated_expires_in
        token = generate_convo_ai_token(
            app_id=app_id,
            app_certificate=app_certificate,
            channel_name=channel,
            uid=_parse_numeric_uid(agent_uid, "agent_uid"),
            **token_kwargs,
        )

    # Fields that are always present in the payload.
    base_kwargs: typing.Dict[str, typing.Any] = {
        "channel": channel,
        "token": token,
        "agent_rtc_uid": agent_uid,
        "remote_rtc_uids": remote_uids,
    }

    # Optional fields are added only when configured, so unset values are
    # never sent as explicit None.
    if idle_timeout is not None:
        base_kwargs["idle_timeout"] = idle_timeout
    if enable_string_uid is not None:
        base_kwargs["enable_string_uid"] = enable_string_uid
    if self._mllm is not None:
        base_kwargs["mllm"] = self._mllm
    if self._turn_detection is not None:
        base_kwargs["turn_detection"] = self._turn_detection
    if self._interruption is not None:
        base_kwargs["interruption"] = self._interruption
    if self._sal is not None:
        base_kwargs["sal"] = self._sal
    if self._avatar is not None:
        base_kwargs["avatar"] = self._avatar
    if self._advanced_features is not None:
        base_kwargs["advanced_features"] = self._advanced_features
    # _resolved_parameters() may inject data_channel="rtm"; dict-shaped
    # parameters are converted to the request model here.
    parameters = self._resolved_parameters()
    if parameters is not None:
        if isinstance(parameters, dict):
            base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
        else:
            base_kwargs["parameters"] = parameters
    if self._geofence is not None:
        base_kwargs["geofence"] = self._geofence
    if self._labels is not None:
        base_kwargs["labels"] = self._labels
    if self._rtc is not None:
        base_kwargs["rtc"] = self._rtc
    if self._filler_words is not None:
        base_kwargs["filler_words"] = self._filler_words

    if is_mllm_mode:
        # MLLM mode: no llm/tts/asr entries. Agent-level greeting and failure
        # message act as fallbacks only (setdefault), on a copy of the config.
        if self._mllm is not None:
            mllm_config = dict(self._mllm)
            if self._greeting is not None:
                mllm_config.setdefault("greeting_message", self._greeting)
            if self._failure_message is not None:
                mllm_config.setdefault("failure_message", self._failure_message)
            base_kwargs["mllm"] = mllm_config
        return StartAgentsRequestProperties(**base_kwargs)

    if skip_vendor_validation:
        # The caller asked to skip the vendor checks; llm/tts/asr are not
        # attached on this path.
        return StartAgentsRequestProperties(**base_kwargs)

    if self._tts is None:
        raise ValueError("TTS configuration is required. Use with_tts() to set it.")

    if self._llm is None:
        raise ValueError("LLM configuration is required. Use with_llm() to set it.")

    # Work on a copy so the agent's stored LLM config is not mutated.
    llm_config = dict(self._llm)
    # Agent-level fields take priority over the vendor's defaults.
    # This matches the TS SDK where agent-level values override vendor config.
    if self._instructions is not None:
        llm_config["system_messages"] = [{"role": "system", "content": self._instructions}]
    if self._greeting is not None:
        llm_config["greeting_message"] = self._greeting
    if self._greeting_configs is not None:
        llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs)
    if self._failure_message is not None:
        llm_config["failure_message"] = self._failure_message
    if self._max_history is not None:
        llm_config["max_history"] = self._max_history

    base_kwargs["llm"] = llm_config
    base_kwargs["tts"] = self._tts
    # The STT configuration is sent under the API's "asr" key.
    if self._stt is not None:
        base_kwargs["asr"] = self._stt

    return StartAgentsRequestProperties(**base_kwargs)
self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import 
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    # Declared with total=False, so every key below may be omitted; the
    # required keys are inherited from _AgentSessionRequiredOptions.
    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # A single preset name or a sequence of preset names.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callable that receives warning messages as plain strings.
    warn: typing.Callable[[str], None]
def __init__(
    self,
    client: typing.Any,
    agent: Agent,
    app_id: str,
    name: str,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    app_certificate: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> None:
    """Store the session configuration and reset the lifecycle state.

    No request is made here; the arguments are only recorded on the
    instance. A new session starts in the ``"idle"`` status, has no agent
    ID, and has no registered event handlers.

    Parameters
    ----------
    client
        Client object whose ``agents`` / ``agent_management`` attributes are
        used for API calls.
    agent
        The :class:`Agent` definition this session runs.
    app_id, app_certificate
        App credentials; used when a token has to be generated.
    name, channel, agent_uid, remote_uids
        Session name, channel, agent UID and remote UIDs.
    token
        Pre-built token; when falsy, ``start()`` generates one from the app
        credentials.
    idle_timeout, enable_string_uid, preset, pipeline_id, expires_in
        Optional start settings, stored as given.
    debug
        When truthy, ``start()`` prints the outgoing request.
    warn
        Callable used to report warnings; defaults to ``warnings.warn``.
    """
    self._client = client
    self._agent = agent
    self._app_id = app_id
    self._app_certificate = app_certificate
    self._name = name
    self._channel = channel
    self._token = token
    self._agent_uid = agent_uid
    self._remote_uids = remote_uids
    self._idle_timeout = idle_timeout
    self._enable_string_uid = enable_string_uid
    self._preset = preset
    self._pipeline_id = pipeline_id
    self._expires_in = expires_in
    self._debug = debug
    # Fall back to the standard library when no custom warning sink is given.
    self._warn = warn or warnings.warn
    # Set by start() once the server has returned an agent ID.
    self._agent_id: typing.Optional[str] = None
    # Lifecycle status string; a new session starts as "idle".
    self._status: str = "idle"
    # Event name -> handlers, in registration order.
    self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {}
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None:
    """Fill in session-derived avatar parameters inside ``properties``.

    Mutates ``properties["avatar"]["params"]`` in place (creating the
    ``params`` dict when it is missing or not a dict) and validates the
    avatar configuration. Does nothing when there is no avatar dict or when
    the avatar has ``"enable": False``.

    Raises
    ------
    ValueError
        If a token must be generated for the avatar but the session has no
        ``app_certificate``. ``validate_avatar_config`` and the token helpers
        may raise as well (not visible from this method).
    """
    avatar = properties.get("avatar")
    # Only an enabled avatar dict is processed; "enable" defaults to True.
    if not isinstance(avatar, dict) or avatar.get("enable", True) is False:
        return

    params = avatar.get("params")
    if not isinstance(params, dict):
        # Attach a fresh params dict so the writes below land on the avatar.
        params = {}
        avatar["params"] = params

    if is_generic_avatar(avatar):
        # Generic avatars inherit the session's app ID and channel unless the
        # caller supplied truthy values of their own.
        if not params.get("agora_appid"):
            params["agora_appid"] = self._app_id
        if not params.get("agora_channel"):
            params["agora_channel"] = self._channel

    # No token handling for avatars that are not token-managed: validate and stop.
    if not is_avatar_token_managed(avatar):
        validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar))
        return

    # Without an avatar UID no token can be generated: validate and stop.
    if not params.get("agora_uid"):
        validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar))
        return

    if not params.get("agora_token"):
        if not self._app_certificate:
            raise ValueError(
                "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. "
                "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor."
            )
        token_kwargs: typing.Dict[str, typing.Any] = {}
        # NOTE(review): expires_in is forwarded as-is here; whether it has been
        # validated beforehand depends on the caller. Confirm.
        if self._expires_in is not None:
            token_kwargs["token_expire"] = self._expires_in
        params["agora_token"] = generate_convo_ai_token(
            app_id=self._app_id,
            app_certificate=self._app_certificate,
            channel_name=self._channel,
            uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"),
            **token_kwargs,
        )

    # Compared as strings; a matching UID only produces a warning, not an error.
    if str(params.get("agora_uid")) == self._agent_uid:
        self._warn(
            "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher."
        )

    validate_avatar_config(avatar, require_session_fields=True)
self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
def stop(self) -> None:
    """Stop the running agent session.

    A 404 from the server (the agent has already stopped, e.g. it crashed or
    timed out) is treated as a successful stop instead of an error.

    Raises
    ------
    RuntimeError
        If the session is not in the ``running`` state or has no agent ID.
    Exception
        Any other failure of the stop request is re-raised after the session
        has been moved to the ``error`` state and an ``error`` event emitted.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot stop session in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    def mark_stopped() -> None:
        # Shared by the normal path and the "already gone" (404) path.
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})

    def mark_failed(error: BaseException) -> None:
        self._status = "error"
        self._emit("error", error)

    self._status = "stopping"

    try:
        self._client.agents.stop(
            self._app_id, self._agent_id, request_options=self._request_options()
        )
        mark_stopped()
    except ApiError as api_error:
        if api_error.status_code != 404:
            mark_failed(api_error)
            raise
        mark_stopped()
    except Exception as error:
        mark_failed(error)
        raise
def think(
    self,
    text: str,
    *,
    on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None,
    on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None,
    on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None,
    interruptable: typing.Optional[bool] = None,
    metadata: typing.Optional[typing.Dict[str, str]] = None,
    options: typing.Optional["ThinkOptions"] = None,
) -> AgentThinkResponse:
    """Inject a custom text instruction into the current session pipeline.

    Values from ``options`` are applied first; any keyword argument passed
    explicitly (i.e. not ``None``) then overrides the matching entry.

    In API v2.7, omitting ``on_listening_action`` uses the server default
    ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to
    preserve the pre-v2.7 behavior.

    Raises
    ------
    RuntimeError
        If the session is not running or has no agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot think in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    request_fields: typing.Dict[str, typing.Any] = {"text": text}
    if options is not None:
        request_fields.update(options)

    # Explicit keyword arguments win over ``options``; None means "not given".
    explicit_overrides = (
        ("on_listening_action", on_listening_action),
        ("on_thinking_action", on_thinking_action),
        ("on_speaking_action", on_speaking_action),
        ("interruptable", interruptable),
        ("metadata", metadata),
    )
    for field_name, field_value in explicit_overrides:
        if field_value is not None:
            request_fields[field_name] = field_value

    return self._client.agent_management.agent_think(
        self._app_id,
        self._agent_id,
        request_options=self._request_options(),
        **request_fields,
    )
def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse:
    """Get all turn analytics pages for this session.

    Starts at page 1 and keeps requesting the next page while the latest
    pagination metadata reports ``is_last_page`` as exactly ``False``. The
    turns of every fetched page are concatenated in request order. The
    result is built from the most recently fetched response with its
    ``turns`` replaced by the combined list.

    If the pagination metadata is absent, or ``is_last_page`` is missing
    rather than ``False``, no further pages are requested.

    Raises ``RuntimeError`` if the server's pagination metadata is missing
    the fields required to advance, or if requesting the next page returns
    a page index that did not advance.
    """
    response = self.get_turns(page_index=1, page_size=page_size)
    all_turns = self._response_turns(response)
    pagination = self._response_pagination(response)
    # Trust the server's page index when present, otherwise assume page 1.
    current_page = self._page_value(pagination, "page_index") or 1
    while pagination is not None and self._page_value(pagination, "is_last_page") is False:
        total_pages = self._page_value(pagination, "total_pages")
        returned_index = self._page_value(pagination, "page_index")
        # Told there is more data, but given neither a position nor a total.
        if returned_index is None and total_pages is None:
            raise RuntimeError(
                "get_all_turns pagination cannot continue: response must include "
                "page_index, total_pages, or is_last_page=true."
            )
        # Stop once the known total is reached, even though is_last_page is False.
        if total_pages is not None and current_page >= total_pages:
            break
        next_page = current_page + 1
        response = self.get_turns(page_index=next_page, page_size=page_size)
        all_turns.extend(self._response_turns(response))
        pagination = self._response_pagination(response)
        returned_index = self._page_value(pagination, "page_index") if pagination else None
        if returned_index is not None:
            # A page index that did not move forward would loop forever,
            # unless the server also says this was the last page.
            if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True:
                raise RuntimeError(
                    f"get_all_turns pagination did not advance: requested page {next_page}, "
                    f"received page {returned_index}."
                )
            current_page = returned_index
        else:
            # No index returned: continue counting locally, but only if the
            # response gives some other way to know when to stop.
            total_pages = self._page_value(pagination, "total_pages") if pagination else None
            is_last_page = self._page_value(pagination, "is_last_page") if pagination else None
            if total_pages is None and is_last_page is not True:
                raise RuntimeError(
                    "get_all_turns pagination cannot continue: response must include "
                    "page_index, total_pages, or is_last_page=true."
                )
            current_page = next_page
    # NOTE(review): `response` here is the LAST fetched page, so the returned
    # non-turn fields (e.g. pagination) come from that page. Confirm intended.
    return self._with_all_turns(response, all_turns)
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_root_exports.py: | + import pytest + + import agora_agent + import agora_agent.agentkit as agentkit + + + def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + + def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + + def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + + def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ + - id: patch-d475306b + content_hash: sha256:407af5e7564d6e8d0b91f1e117cb433aec931f083225af53c6df2abfff281b22 + original_commit: d475306bd42279984bcf4934b900003e8e02c4eb + original_message: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - docs/getting-started/installation.md + patch_content: | + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index e43d1d8..1da36aa 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index 04b48da..8fca9ab 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + -See [Authentication](./authentication.md) for setup details. + +## Next steps + + + +- [Authentication](./authentication.md) — configure your credentials + +- [Quick Start](./quick-start.md) — build your first conversational agent + + + +## Migrating from a previous package name + + + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). 
+ theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. + --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Imports + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI + ``` + + The package installs as `agora-agents` and imports as `agora_agent`. + + ## Sync vs. 
Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + ## Dependencies + + | Package | Purpose | + | ------------------------------ | ------------------------------------------------------ | + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Next steps + + - [Authentication](./authentication.md) — configure your credentials + - [Quick Start](./quick-start.md) — build your first conversational agent + + ## Migrating from a previous package name + + The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). + status: unresolved + - id: patch-c9355576 + content_hash: sha256:83b3b6148b21f2b4d53ee67321777522f5f4e871b61ea3b23f3a6b88ca052769 + original_commit: c93555763ffd63267a737b3e430217a890f203db + original_message: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/authentication.md + - docs/guides/low-level-api.md + patch_content: | + diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md + index 31dcc56..74c62cd 100644 + --- a/docs/getting-started/authentication.md + +++ b/docs/getting-started/authentication.md + @@ -46,41 +46,6 @@ session = agent.create_session( + print(client.auth_mode) # "app-credentials" + ``` + + -## Other auth modes + +## Legacy auth modes + + -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. + - + -### Token auth (`auth_token`) + - + -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - auth_token="your-rest-auth-token", + -) + - + -session = agent.create_session( + - client, + - channel="room-123", + - agent_uid="1", + - remote_uids=["100"], + - token="your-rtc-join-token", + -) + -``` + - + -### Basic Auth (`customer_id` + `customer_secret`) + - + -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - customer_id="your-customer-id", + - customer_secret="your-customer-secret", + -) + -``` + +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. 
+ diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md + index 6677b45..47397b7 100644 + --- a/docs/guides/low-level-api.md + +++ b/docs/guides/low-level-api.md + @@ -1,187 +1,55 @@ + --- + sidebar_position: 10 + title: Low-Level API + -description: Direct client.agents.start() usage without the builder pattern. + +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. + +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + -## Raw telephony and phone-number APIs + - + -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + - + -- `client.telephony` for call status and hangup operations + -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
+ + -## Cascading flow (ASR → LLM → TTS) + +## Client setup + + ```python + from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import Tts_Elevenlabs + + client = Agora( + area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + -client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + + app_id="your-app-id", + + app_certificate="your-app-certificate", + ) + ``` + + -## Async (low-level) + +## Raw telephony and phone-number APIs + + -```python + -import asyncio + -from agora_agent import Area, AsyncAgora + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import 
Tts_Elevenlabs + +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + -client = AsyncAgora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + +- `client.telephony` for call status and hangup operations + +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + + +```python + +calls = client.telephony.list( + + appid=client.app_id, + + type="sip", + ) + + -async def main() -> None: + - await client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + - ) + - + -asyncio.run(main()) + +for call in calls: + + print(call.id, call.state) + ``` + + -## MLLM flow (multimodal) + +## Direct agent APIs + + -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). + +`client.agents` exposes the generated REST surface for advanced integrations. 
Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + -```python + -from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesMllm, + - StartAgentsRequestPropertiesMllmVendor, + - StartAgentsRequestPropertiesTts, + - StartAgentsRequestPropertiesTtsVendor, + - StartAgentsRequestPropertiesLlm, + -) + +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + -client = Agora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + - + -client.agents.start( + - client.app_id, + - name="mllm_agent", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="your_token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - mllm=StartAgentsRequestPropertiesMllm( + - enable=True, + - url="wss://api.openai.com/v1/realtime", + - api_key="", + - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + - params={ + - "model": "gpt-4o-realtime-preview", + - "voice": "alloy", + - }, + - input_modalities=["audio"], + - output_modalities=["text", "audio"], + - greeting_message="Hello! I'm ready to chat in real-time.", + - turn_detection={ + - "mode": "server_vad", + - "server_vad_config": { + - "idle_timeout_ms": 5000, + - }, + - }, + - ), + - ), + +```python + +info = session.raw.get( + + appid=session.app_id, + + agent_id=session.id, + ) + ``` + + -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). + +You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ theirs_snapshot: + docs/getting-started/authentication.md: | + --- + sidebar_position: 2 + title: Authentication + description: Configure the Python SDK with app credentials and understand other supported auth modes. + --- + + # Authentication + + Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. + + ## App credentials + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + + agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) + ) + + session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + ) + ``` + + ## Why app credentials + + - Fresh short-lived tokens per API call instead of reusing long-lived credentials + - No Customer ID / Customer Secret in request headers + - No manual REST or RTC token provisioning in application code + + ## Inspecting auth mode + + ```python + print(client.auth_mode) # "app-credentials" + ``` + + ## Legacy auth modes + + The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. + docs/guides/low-level-api.md: | + --- + sidebar_position: 10 + title: Low-Level API + description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + Use the `Agent` builder and `AgentSession` for conversational agent starts. 
That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. + + ## Client setup + + ```python + from agora_agent import Agora, Area + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + ``` + + ## Raw telephony and phone-number APIs + + AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + - `client.telephony` for call status and hangup operations + - `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + ```python + calls = client.telephony.list( + appid=client.app_id, + type="sip", + ) + + for call in calls: + print(call.id, call.state) + ``` + + ## Direct agent APIs + + `client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + ```python + info = session.raw.get( + appid=session.app_id, + agent_id=session.id, + ) + ``` + + You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ status: unresolved diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f84862c..0d7a4aa 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( AgentThinkAgentManagementRequestOnListeningAction, ) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index a749d1e..ddcd930 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -24,6 +24,7 @@ is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 50bdd08..f48098c 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -177,6 +177,49 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "generic", "params": params} +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = 
Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + +class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index b58f040..62cb3f2 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import 
BaseModel, ConfigDict, Field diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..9719b04 --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), {"data_channel": "rtm"}) # noqa: SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def test_with_audio_scenario_sets_session_parameter(): + agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def 
test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "agent greeting" + assert properties.llm.failure_message == "agent failure" + assert properties.llm.max_history == 2 + + +def test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + 
) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard must fire before the token-generation step so callers get a + clear, actionable error even when app_id/app_certificate are empty. 
+ """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + 
assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..198fcd0 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,383 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, + RimeTTS, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + warn=warn, + ) + + +def test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = 
Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + 
.with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["llm"]["greeting_message"] == "agent greeting" + assert properties["llm"]["failure_message"] == "agent failure" + assert properties["llm"]["max_history"] == 2 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + 
agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 + + +def test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + 
session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with 
pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..8473821 --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,122 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +import warnings + +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + +def test_xai_realtime_deprecated_alias_emits_same_vendor(): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", DeprecationWarning) + config = XaiRealtime(api_key="xai-key").to_config() + assert len(caught) == 1 + assert issubclass(caught[0].category, DeprecationWarning) + assert config["vendor"] == "xai" + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def test_generic_avatar_omits_session_enriched_fields_when_unset(): + config = GenericAvatar( + api_key="avatar-key", + 
api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + assert config["params"]["model"] == "explicit-model" + assert config["params"]["project_id"] == "explicit-project" + assert config["params"]["location"] == "explicit-region" + assert config["params"]["adc_credentials_string"] == "{}" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), + ).to_config() + + assert config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py new file mode 100644 index 0000000..fa73fc0 --- 
/dev/null +++ b/tests/custom/test_avatar_token.py @@ -0,0 +1,12 @@ +from agora_agent.agentkit import generate_convo_ai_token + + +def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): + token = generate_convo_ai_token( + app_id="0" * 32, + app_certificate="1" * 32, + channel_name="room", + uid=123, + ) + + assert token.startswith("007") diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py new file mode 100644 index 0000000..faca9bf --- /dev/null +++ b/tests/custom/test_llm_vendors.py @@ -0,0 +1,60 @@ +from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM + + +def test_groq_serializes_as_openai_compatible() -> None: + config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() + + assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" + assert config["api_key"] == "groq-key" + assert config["style"] == "openai" + assert config["params"]["model"] == "llama-3.3-70b-versatile" + + +def test_custom_llm_marks_request_as_custom() -> None: + config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() + + assert config["url"] == "https://llm.example.com/chat" + assert config["api_key"] == "key" + assert config["vendor"] == "custom" + assert config["style"] == "openai" + + +def test_vertex_ai_llm_includes_project_routing() -> None: + config = VertexAILLM( + api_key="vertex-token", + model="gemini-2.0-flash", + project_id="project", + location="us-central1", + ).to_config() + + assert config["api_key"] == "vertex-token" + assert config["style"] == "gemini" + assert config["params"]["model"] == "gemini-2.0-flash" + assert config["params"]["project_id"] == "project" + assert config["params"]["location"] == "us-central1" + + +def test_amazon_bedrock_serializes_as_anthropic_style() -> None: + config = AmazonBedrock( + api_key="bedrock-key", + url="https://bedrock.example.com/messages", + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + 
).to_config() + + assert config["api_key"] == "bedrock-key" + assert config["style"] == "anthropic" + assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + + +def test_dify_serializes_conversation_fields() -> None: + config = Dify( + api_key="dify-key", + url="https://api.dify.ai/v1/chat-messages", + user="user-1", + conversation_id="conversation-1", + ).to_config() + + assert config["api_key"] == "dify-key" + assert config["style"] == "dify" + assert config["params"]["user"] == "user-1" + assert config["params"]["conversation_id"] == "conversation-1" diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py new file mode 100644 index 0000000..9b2f508 --- /dev/null +++ b/tests/custom/test_root_exports.py @@ -0,0 +1,29 @@ +import pytest + +import agora_agent +import agora_agent.agentkit as agentkit + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + +def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + +def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + +def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + +def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ From 198f367ff94bd554608b7bc73a5c692380f5f81f Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 22:24:08 -0400 Subject: [PATCH 11/26] Update AgentKit TTS provider docs and examples --- docs/concepts/agent.md | 4 ++-- docs/concepts/session.md | 2 +- docs/concepts/vendors.md | 13 ++++++------ 
docs/guides/agent-builder-features.md | 18 ++++++++-------- docs/guides/avatars.md | 5 ++++- docs/guides/byok.md | 1 + docs/guides/cascading-flow.md | 4 ++-- docs/reference/agent.md | 2 +- docs/reference/vendors.md | 25 ++++++++++++++++------- src/agora_agent/agentkit/agent.py | 2 +- src/agora_agent/agentkit/agent_session.py | 4 ++-- 11 files changed, 48 insertions(+), 32 deletions(-) diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index b89f08c..b45a1b7 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -88,7 +88,7 @@ agent = ( model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) ``` @@ -110,7 +110,7 @@ base = ( model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) diff --git a/docs/concepts/session.md b/docs/concepts/session.md index a513e85..ae43bc6 100644 --- a/docs/concepts/session.md +++ b/docs/concepts/session.md @@ -46,7 +46,7 @@ agent = ( model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are helpful.'}], )) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', 
base_url='wss://api.elevenlabs.io/v1')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index a6268f8..115c0ca 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -44,15 +44,15 @@ Used with `agent.with_tts()`. Each TTS vendor produces audio at a specific sampl | Class | Provider | Required Parameters | Sample Rate | |---|---|---|---| -| `ElevenLabsTTS` | ElevenLabs | `key`, `model_id`, `voice_id` | 16000, 22050, 24000, or 44100 Hz | +| `ElevenLabsTTS` | ElevenLabs | `key`, `model_id`, `voice_id`, `base_url` | 16000, 22050, 24000, or 44100 Hz | | `MicrosoftTTS` | Microsoft Azure | `key`, `region`, `voice_name` | 8000, 16000, 24000, or 48000 Hz | | `OpenAITTS` | OpenAI | `key`, `voice` | 24000 Hz (fixed) | -| `CartesiaTTS` | Cartesia | `key`, `voice_id` | 8000–48000 Hz | +| `CartesiaTTS` | Cartesia | `api_key`, `voice_id`, `model_id` | 8000–48000 Hz | | `GoogleTTS` | Google Cloud | `key`, `voice_name` | — | -| `AmazonTTS` | Amazon Polly | `access_key`, `secret_key`, `region`, `voice_id` | — | -| `HumeAITTS` | Hume AI | `key` | — | -| `RimeTTS` | Rime | `key`, `speaker` | — | -| `FishAudioTTS` | Fish Audio | `key`, `reference_id` | — | +| `AmazonTTS` | Amazon Polly | `access_key`, `secret_key`, `region`, `voice_id`, `engine` | — | +| `HumeAITTS` | Hume AI | `key`, `voice_id`, `provider` | — | +| `RimeTTS` | Rime | `key`, `speaker`, `model_id` | — | +| `FishAudioTTS` | Fish Audio | `key`, `reference_id`, `backend` | — | | `GroqTTS` | Groq | `key` | — | | `MiniMaxTTS` | MiniMax | `key` | — | | `DeepgramTTS` | Deepgram | `api_key`, `model` | Configurable | @@ -66,6 +66,7 @@ tts = ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=24000, ) ``` diff --git a/docs/guides/agent-builder-features.md b/docs/guides/agent-builder-features.md index a19c140..e356d2b 100644 --- 
a/docs/guides/agent-builder-features.md +++ b/docs/guides/agent-builder-features.md @@ -56,7 +56,7 @@ agent = ( model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) - .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-key', model='nova-2', language='en-US')) ) ``` @@ -109,7 +109,7 @@ agent = ( data_channel=DataChannel.RTM, # or DataChannel.DATASTREAM )) .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -125,7 +125,7 @@ agent = ( failure_message='Something went wrong.', max_history=15, )) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -141,7 +141,7 @@ agent = ( Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.NORTH_AMERICA)) .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -150,7 +150,7 @@ agent = ( Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.GLOBAL, exclude_area=GeofenceExcludeArea.EUROPE)) .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - 
.with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -170,7 +170,7 @@ agent = ( 'version': '1.2.0', }) .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -189,7 +189,7 @@ agent = ( encryption_mode=5, # AES_128_GCM )) .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -226,7 +226,7 @@ agent = ( ), )) .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -295,7 +295,7 @@ agent = ( failure_message='Sorry, I had trouble processing that.', max_history=20, )) - .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-key', model='nova-2', language='en-US')) .with_advanced_features(AdvancedFeatures(enable_rtm=True)) .with_parameters(SessionParams( diff --git 
a/docs/guides/avatars.md b/docs/guides/avatars.md index fe74e95..978f6b2 100644 --- a/docs/guides/avatars.md +++ b/docs/guides/avatars.md @@ -64,6 +64,7 @@ agent = ( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=24000, # Must be 24000 for HeyGen )) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) @@ -114,6 +115,7 @@ agent = ( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=16000, # Must be 16000 for Akool )) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) @@ -142,6 +144,7 @@ agent = ( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=16000, # 16 kHz )) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) @@ -165,7 +168,7 @@ The `with_avatar()` call validates against the currently configured TTS. 
Always # Correct order: TTS first, then avatar agent = ( Agent(name='my-agent') - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_avatar(HeyGenAvatar(api_key='your-heygen-key', quality='medium', agora_uid='2')) ) ``` diff --git a/docs/guides/byok.md b/docs/guides/byok.md index ad60663..a0c0d7e 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -54,6 +54,7 @@ def main() -> None: key=os.environ["ELEVENLABS_API_KEY"], model_id="eleven_flash_v2_5", voice_id=os.environ["ELEVENLABS_VOICE_ID"], + base_url="wss://api.elevenlabs.io/v1", sample_rate=24000, ) ) diff --git a/docs/guides/cascading-flow.md b/docs/guides/cascading-flow.md index d919a48..f753062 100644 --- a/docs/guides/cascading-flow.md +++ b/docs/guides/cascading-flow.md @@ -32,7 +32,7 @@ agent = ( model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], )) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2')) ) @@ -63,7 +63,7 @@ async def main(): model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], )) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) 
.with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2')) ) diff --git a/docs/reference/agent.md b/docs/reference/agent.md index a1205fe..35eacce 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -70,7 +70,7 @@ Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. ```python from agora_agent import ElevenLabsTTS -agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) +agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) ``` ### `with_stt(vendor: BaseSTT) -> Agent` diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 4747af2..ba9cacb 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -154,7 +154,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `key` | `str` | Yes | — | ElevenLabs API key | | `model_id` | `str` | Yes | — | Model ID (e.g., `eleven_flash_v2_5`) | | `voice_id` | `str` | Yes | — | Voice ID | -| `base_url` | `str` | No | `None` | Custom WebSocket base URL | +| `base_url` | `str` | Yes | — | WebSocket base URL | | `sample_rate` | `int` | No | `None` | Sample rate: 16000, 22050, 24000, or 44100 Hz | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | | `optimize_streaming_latency` | `int` | No | `None` | Latency optimization level (0–4) | @@ -192,7 +192,7 @@ Fixed sample rate: 24000 Hz. |---|---|---|---|---| | `api_key` | `str` | Yes | — | Cartesia API key | | `voice_id` | `str` | Yes | — | Voice ID (serialized as `{"mode": "id", "id": "..."}`) | -| `model_id` | `str` | No | `None` | Model ID | +| `model_id` | `str` | Yes | — | Model ID | | `sample_rate` | `int` | No | `None` | Sample rate: 8000–48000 Hz | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | @@ -213,6 +213,7 @@ Fixed sample rate: 24000 Hz. 
| `secret_key` | `str` | Yes | — | AWS secret key | | `region` | `str` | Yes | — | AWS region (e.g., `us-east-1`) | | `voice_id` | `str` | Yes | — | Amazon Polly voice ID | +| `engine` | `str` | Yes | — | Amazon Polly engine type | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `DeepgramTTS` @@ -231,7 +232,12 @@ Fixed sample rate: 24000 Hz. | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `key` | `str` | Yes | — | Hume AI API key | +| `voice_id` | `str` | Yes | — | Hume AI voice ID | +| `provider` | `str` | Yes | — | Voice provider type, such as `CUSTOM_VOICE` or `HUME_AI` | | `config_id` | `str` | No | `None` | Configuration ID | +| `base_url` | `str` | No | `None` | Base URL | +| `speed` | `float` | No | `None` | Playback speed | +| `trailing_silence` | `float` | No | `None` | Trailing silence in seconds | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `RimeTTS` @@ -240,10 +246,8 @@ Fixed sample rate: 24000 Hz. |---|---|---|---|---| | `key` | `str` | Yes | — | Rime API key | | `speaker` | `str` | Yes | — | Speaker ID | -| `model_id` | `str` | No | `None` | Model ID | -| `lang` | `str` | No | `None` | Language code | -| `sampling_rate` | `int` | No | `None` | Sampling rate in Hz (serialized as `samplingRate`) | -| `speed_alpha` | `float` | No | `None` | Speed multiplier (serialized as `speedAlpha`) | +| `model_id` | `str` | Yes | — | Model ID | +| `base_url` | `str` | No | `None` | WebSocket URL | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `FishAudioTTS` @@ -252,6 +256,7 @@ Fixed sample rate: 24000 Hz. |---|---|---|---|---| | `key` | `str` | Yes | — | Fish Audio API key | | `reference_id` | `str` | Yes | — | Reference ID | +| `backend` | `str` | Yes | — | Backend model version | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `MiniMaxTTS` @@ -270,8 +275,14 @@ Fixed sample rate: 24000 Hz. 
| Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `key` | `str` | Yes | — | Murf API key | -| `voice_id` | `str` | Yes | — | Voice ID (e.g., `Ariana`, `Natalie`) | +| `voice_id` | `str` | No | `None` | Voice ID (e.g., `Ariana`, `Natalie`) | +| `base_url` | `str` | No | `None` | WebSocket endpoint | | `style` | `str` | No | `None` | Voice style (e.g., `Conversational`) | +| `locale` | `str` | No | `None` | Voice locale | +| `rate` | `float` | No | `None` | Speech rate | +| `pitch` | `float` | No | `None` | Pitch adjustment | +| `model` | `str` | No | `None` | TTS model | +| `sample_rate` | `int` | No | `None` | Audio sample rate | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `SarvamTTS` diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index bc1f803..a5912c0 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -281,7 +281,7 @@ class Agent: >>> agent = ( ... agent ... .with_llm(OpenAI(api_key="...", model="gpt-4")) - ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) ... 
) """ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index a749d1e..a4cac72 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -416,7 +416,7 @@ class AgentSession(_AgentSessionBase): >>> >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") - >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = session.start() >>> session.say("Hello!") @@ -737,7 +737,7 @@ class AsyncAgentSession(_AgentSessionBase): >>> >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are helpful.") - >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = await session.start() >>> await session.say("Hello!") From 0297a70e378f32a876e92e3200cf1e20d4341936 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 22:49:24 -0400 Subject: [PATCH 12/26] Update AgentKit v2.1 provider docs and examples --- README.md | 1 + docs/concepts/agent.md | 3 +++ docs/concepts/session.md | 1 + docs/concepts/vendors.md | 12 ++++++------ docs/guides/agent-builder-features.md | 15 +++++++++------ 
docs/guides/avatars.md | 3 +++ docs/guides/byok.md | 1 + docs/guides/cascading-flow.md | 4 ++++ docs/reference/agent.md | 2 +- docs/reference/vendors.md | 23 +++++++++++++++-------- src/agora_agent/agentkit/agent.py | 2 +- src/agora_agent/agentkit/agent_session.py | 4 ++-- 12 files changed, 47 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index cd43021..9a80c3d 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,7 @@ agent = Agent().with_stt( ).with_llm( OpenAI( api_key=os.environ["OPENAI_API_KEY"], + base_url="https://api.openai.com/v1/chat/completions", model="gpt-4o-mini", system_messages=[{"role": "system", "content": AGENT_PROMPT}], greeting_message=GREETING, diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index b45a1b7..1122c59 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -17,6 +17,7 @@ from agora_agent import Agent, OpenAI agent = Agent(name='support-assistant').with_llm( OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful voice assistant.'}], greeting_message='Hello! 
How can I help you?', @@ -85,6 +86,7 @@ agent = ( Agent(name='my-agent') .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) @@ -107,6 +109,7 @@ base = ( Agent() .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) diff --git a/docs/concepts/session.md b/docs/concepts/session.md index ae43bc6..8d70add 100644 --- a/docs/concepts/session.md +++ b/docs/concepts/session.md @@ -43,6 +43,7 @@ agent = ( Agent(name='my-agent') .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are helpful.'}], )) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 115c0ca..8ee4e9f 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -23,7 +23,7 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). |---|---|---| | `OpenAI` | OpenAI | `api_key` | | `AzureOpenAI` | Azure OpenAI | `api_key`, `endpoint`, `deployment_name` | -| `Anthropic` | Anthropic | `api_key` | +| `Anthropic` | Anthropic | `api_key`, `url`, `headers`, `max_tokens` | | `Gemini` | Google Gemini | `api_key` | | `Groq` | Groq | `api_key` | | `VertexAILLM` | Google Vertex AI | `api_key`, `project_id`, `location` | @@ -35,7 +35,7 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). 
```python from agora_agent import OpenAI -llm = OpenAI(api_key='your-openai-key', model='gpt-4o-mini') +llm = OpenAI(api_key='your-openai-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini') ``` ## TTS Vendors @@ -81,11 +81,11 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to |---|---|---| | `SpeechmaticsSTT` | Speechmatics | `api_key`, `language` | | `DeepgramSTT` | Deepgram | — (all optional) | -| `MicrosoftSTT` | Microsoft Azure | `key`, `region` | +| `MicrosoftSTT` | Microsoft Azure | `key`, `region`, `language` | | `OpenAISTT` | OpenAI | `api_key` | -| `GoogleSTT` | Google Cloud | `project_id`, `location`, `adc_credentials_string` | -| `AmazonSTT` | Amazon Transcribe | `access_key`, `secret_key`, `region` | -| `AssemblyAISTT` | AssemblyAI | `api_key` | +| `GoogleSTT` | Google Cloud | `project_id`, `location`, `adc_credentials_string`, `language` | +| `AmazonSTT` | Amazon Transcribe | `access_key`, `secret_key`, `region`, `language` | +| `AssemblyAISTT` | AssemblyAI | `api_key`, `language` | | `AresSTT` | Ares | — (all optional) | | `SarvamSTT` | Sarvam | `api_key`, `language` | diff --git a/docs/guides/agent-builder-features.md b/docs/guides/agent-builder-features.md index e356d2b..731cb6e 100644 --- a/docs/guides/agent-builder-features.md +++ b/docs/guides/agent-builder-features.md @@ -53,6 +53,7 @@ agent = ( )) .with_llm(OpenAI( api_key='your-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) @@ -108,7 +109,7 @@ agent = ( ), data_channel=DataChannel.RTM, # or DataChannel.DATASTREAM )) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) 
.with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -121,6 +122,7 @@ agent = ( Agent() .with_llm(OpenAI( api_key='...', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', failure_message='Something went wrong.', max_history=15, @@ -140,7 +142,7 @@ from agora_agent import Agent, GeofenceConfig, GeofenceArea, GeofenceExcludeArea agent = ( Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.NORTH_AMERICA)) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -149,7 +151,7 @@ agent = ( agent = ( Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.GLOBAL, exclude_area=GeofenceExcludeArea.EUROPE)) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -169,7 +171,7 @@ agent = ( 'team': 'support', 'version': '1.2.0', }) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -188,7 +190,7 @@ agent = ( encryption_key='your-32-byte-key', encryption_mode=5, # AES_128_GCM )) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) 
.with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -225,7 +227,7 @@ agent = ( ), ), )) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -289,6 +291,7 @@ agent = ( Agent(name='full-featured-assistant') .with_llm(OpenAI( api_key='your-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful voice assistant.'}], greeting_message='Hello! How can I help?', diff --git a/docs/guides/avatars.md b/docs/guides/avatars.md index 978f6b2..c370b80 100644 --- a/docs/guides/avatars.md +++ b/docs/guides/avatars.md @@ -57,6 +57,7 @@ agent = ( Agent(name='avatar-agent') .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant with a visual avatar.'}], )) @@ -108,6 +109,7 @@ agent = ( Agent(name='akool-agent') .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) @@ -137,6 +139,7 @@ agent = ( Agent(name='broken-agent') .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], )) diff --git a/docs/guides/byok.md b/docs/guides/byok.md index a0c0d7e..9a66414 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -43,6 +43,7 @@ 
def main() -> None: .with_llm( OpenAI( api_key=os.environ["OPENAI_API_KEY"], + base_url="https://api.openai.com/v1/chat/completions", model="gpt-4o-mini", system_messages=[{"role": "system", "content": "You are a concise support voice assistant."}], greeting_message="Hello! How can I help you today?", diff --git a/docs/guides/cascading-flow.md b/docs/guides/cascading-flow.md index f753062..45d44ce 100644 --- a/docs/guides/cascading-flow.md +++ b/docs/guides/cascading-flow.md @@ -29,6 +29,7 @@ agent = ( Agent(name='assistant') .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], )) @@ -60,6 +61,7 @@ async def main(): Agent(name='assistant') .with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], )) @@ -125,6 +127,7 @@ from agora_agent import OpenAI llm = OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', temperature=0.7, top_p=0.9, @@ -139,6 +142,7 @@ Configure greetings on the LLM vendor so message ownership stays with the LLM co ```python agent = Agent(name='greeter').with_llm(OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], greeting_message='Hi there! What can I do for you?', diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 35eacce..9094ba5 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -60,7 +60,7 @@ Set the LLM vendor for cascading flow. 
```python from agora_agent import OpenAI -agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) +agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) ``` ### `with_tts(vendor: BaseTTS) -> Agent` diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index ba9cacb..0044044 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -43,7 +43,7 @@ from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIR ```python from agora_agent import OpenAI -llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) +llm = OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', temperature=0.7) ``` ### `AzureOpenAI` @@ -84,7 +84,9 @@ llm = AzureOpenAI( |---|---|---|---|---| | `api_key` | `str` | Yes | — | Anthropic API key | | `model` | `str` | No | `claude-3-5-sonnet-20241022` | Model name | -| `max_tokens` | `int` | No | `None` | Maximum tokens | +| `url` | `str` | Yes | — | Anthropic messages endpoint URL | +| `headers` | `Dict[str, str]` | Yes | — | Request headers, including Anthropic API version | +| `max_tokens` | `int` | Yes | — | Maximum tokens | | `temperature` | `float` | No | `None` | Sampling temperature (0.0–1.0) | | `top_p` | `float` | No | `None` | Nucleus sampling (0.0–1.0) | | `system_messages` | `List[Dict]` | No | `None` | System messages | @@ -93,7 +95,6 @@ llm = AzureOpenAI( | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `params` | `Dict[str, Any]` | No | `None` | Additional model parameters | -| `headers` | `Dict[str, str]` | No | `None` | Custom HTTP headers forwarded to the LLM provider | | `greeting_configs` | `Dict[str, Any]` | No | `None` | Greeting playback configuration | | `template_variables` | `Dict[str, str]` | No | `None` | Template variables for 
messages | @@ -101,7 +102,13 @@ llm = AzureOpenAI( ```python from agora_agent import Anthropic -llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022') +llm = Anthropic( + api_key='your-anthropic-key', + url='https://api.anthropic.com/v1/messages', + headers={'anthropic-version': '2023-06-01'}, + model='claude-3-5-sonnet-20241022', + max_tokens=1024, +) ``` ### `Gemini` @@ -328,7 +335,7 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to |---|---|---|---|---| | `key` | `str` | Yes | — | Azure subscription key | | `region` | `str` | Yes | — | Azure region (e.g., `eastus`) | -| `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `language` | `str` | Yes | — | Language code (e.g., `en-US`) | | `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -351,7 +358,7 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to | `project_id` | `str` | Yes | — | Google Cloud project ID | | `location` | `str` | Yes | — | Google Cloud region | | `adc_credentials_string` | `str` | Yes | — | Google service account credentials JSON string | -| `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `language` | `str` | Yes | — | Language code (e.g., `en-US`) | | `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `model` | `str` | No | `None` | Recognition model | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -363,7 +370,7 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to | `access_key` | `str` | Yes | — | AWS Access Key ID | | `secret_key` | `str` | Yes | — | AWS Secret Access Key | | `region` | `str` | Yes | — | AWS region (e.g., `us-east-1`) | -| `language` | `str` | No | `None` | Amazon `language_code` | +| `language` | `str` | Yes | — | Amazon `language_code` 
| | `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -372,7 +379,7 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | AssemblyAI API key | -| `language` | `str` | No | `None` | Language code | +| `language` | `str` | Yes | — | Language code | | `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `uri` | `str` | No | `None` | AssemblyAI streaming WebSocket URL | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index a5912c0..b4f9fdd 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -280,7 +280,7 @@ class Agent: >>> agent = Agent(instructions="You are a helpful voice assistant.") >>> agent = ( ... agent - ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) ... 
) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index a4cac72..e113dc1 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -416,7 +416,7 @@ class AgentSession(_AgentSessionBase): >>> >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") - >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = session.start() >>> session.say("Hello!") @@ -737,7 +737,7 @@ class AsyncAgentSession(_AgentSessionBase): >>> >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are helpful.") - >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = await session.start() >>> await session.say("Hello!") From 96afe786d3a3652d6180c15aa384b6b630665a1c Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 22:52:52 -0400 Subject: [PATCH 13/26] align v2.1 provider docs with AgentKit validation --- 
docs/concepts/vendors.md | 2 +- docs/reference/vendors.md | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 8ee4e9f..890ca09 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -46,7 +46,7 @@ Used with `agent.with_tts()`. Each TTS vendor produces audio at a specific sampl |---|---|---|---| | `ElevenLabsTTS` | ElevenLabs | `key`, `model_id`, `voice_id`, `base_url` | 16000, 22050, 24000, or 44100 Hz | | `MicrosoftTTS` | Microsoft Azure | `key`, `region`, `voice_name` | 8000, 16000, 24000, or 48000 Hz | -| `OpenAITTS` | OpenAI | `key`, `voice` | 24000 Hz (fixed) | +| `OpenAITTS` | OpenAI | `voice` for Agora-managed `tts-1`; `api_key`, `model`, `base_url`, `voice` for BYOK | 24000 Hz (fixed) | | `CartesiaTTS` | Cartesia | `api_key`, `voice_id`, `model_id` | 8000–48000 Hz | | `GoogleTTS` | Google Cloud | `key`, `voice_name` | — | | `AmazonTTS` | Amazon Polly | `access_key`, `secret_key`, `region`, `voice_id`, `engine` | — | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 0044044..4604ca6 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -184,14 +184,15 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | Yes | — | OpenAI API key | +| `api_key` | `str` | BYOK only | `None` | OpenAI API key | | `voice` | `str` | Yes | — | Voice: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` | -| `model` | `str` | No | `None` | Model: `tts-1` or `tts-1-hd` | +| `model` | `str` | BYOK only | `None` | Model: `tts-1` or `tts-1-hd` | +| `base_url` | `str` | BYOK only | `None` | OpenAI TTS endpoint URL | | `response_format` | `str` | No | `None` | Audio format (e.g., `pcm`) | | `speed` | `float` | No | `None` | Speech speed multiplier | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | -Fixed 
sample rate: 24000 Hz. +`api_key`, `model`, and `base_url` are required together for BYOK. Without `api_key`, AgentKit uses the Agora-managed `tts-1` path. Fixed sample rate: 24000 Hz. ### `CartesiaTTS` From 434c8af12c56b4a314397adc046f4eb1a75c6614 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 23:05:49 -0400 Subject: [PATCH 14/26] Align AgentKit LLM and ASR vendor validation --- docs/concepts/vendors.md | 14 ++++++------ docs/reference/vendors.md | 15 ++++++------- src/agora_agent/agentkit/agent.py | 14 ++++++++++-- src/agora_agent/agentkit/vendors/llm.py | 11 ++++++---- src/agora_agent/agentkit/vendors/tts.py | 3 --- tests/custom/test_llm_vendors.py | 29 ++++++++++++++++++++++++- tests/custom/test_stt_language.py | 10 +++++++++ 7 files changed, 71 insertions(+), 25 deletions(-) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 890ca09..42e3fcd 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -21,14 +21,14 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). 
| Class | Provider | Required Parameters | |---|---|---| -| `OpenAI` | OpenAI | `api_key` | +| `OpenAI` | OpenAI | `model` for Agora-managed models; `api_key`, `base_url`, `model` for BYOK | | `AzureOpenAI` | Azure OpenAI | `api_key`, `endpoint`, `deployment_name` | -| `Anthropic` | Anthropic | `api_key`, `url`, `headers`, `max_tokens` | -| `Gemini` | Google Gemini | `api_key` | -| `Groq` | Groq | `api_key` | -| `VertexAILLM` | Google Vertex AI | `api_key`, `project_id`, `location` | -| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | -| `Dify` | Dify | `api_key`, `url` | +| `Anthropic` | Anthropic | `api_key`, `model`, `url`, `headers`, `max_tokens` | +| `Gemini` | Google Gemini | `api_key`, `model` | +| `Groq` | Groq | `api_key`, `model`, `base_url` | +| `VertexAILLM` | Google Vertex AI | `api_key`, `model`, `project_id`, `location` | +| `AmazonBedrock` | Amazon Bedrock | `access_key`, `secret_key`, `region`, `model` | +| `Dify` | Dify | `api_key`, `url`, `model` | | `CustomLLM` | OpenAI-compatible LLM | `api_key`, `base_url`, `model` | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 4604ca6..da40142 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -23,9 +23,9 @@ from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIR | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | Yes | — | OpenAI API key | -| `model` | `str` | No | `gpt-4o-mini` | Model name | -| `base_url` | `str` | No | `None` | Custom base URL (overrides default OpenAI endpoint) | +| `api_key` | `str` | BYOK only | `None` | OpenAI API key. Optional for supported Agora-managed OpenAI models. | +| `model` | `str` | Yes | — | Model name | +| `base_url` | `str` | BYOK only | `None` | OpenAI Chat Completions endpoint URL. Required when `api_key` is set. 
| | `temperature` | `float` | No | `None` | Sampling temperature (0.0–2.0) | | `top_p` | `float` | No | `None` | Nucleus sampling (0.0–1.0) | | `max_tokens` | `int` | No | `None` | Maximum tokens to generate | @@ -83,7 +83,7 @@ llm = AzureOpenAI( | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Anthropic API key | -| `model` | `str` | No | `claude-3-5-sonnet-20241022` | Model name | +| `model` | `str` | Yes | — | Model name | | `url` | `str` | Yes | — | Anthropic messages endpoint URL | | `headers` | `Dict[str, str]` | Yes | — | Request headers, including Anthropic API version | | `max_tokens` | `int` | Yes | — | Maximum tokens | @@ -116,7 +116,7 @@ llm = Anthropic( | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Google AI API key | -| `model` | `str` | No | `gemini-2.0-flash-exp` | Model name | +| `model` | `str` | Yes | — | Model name | | `temperature` | `float` | No | `None` | Sampling temperature (0.0–2.0) | | `top_p` | `float` | No | `None` | Nucleus sampling (0.0–1.0) | | `top_k` | `int` | No | `None` | Top-k sampling | @@ -146,8 +146,8 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid |---|---|---| | `Groq` | Groq | `api_key`, `model`, `base_url?` | | `VertexAILLM` | Google Vertex AI | `api_key`, `model`, `project_id`, `location`, `url?` | -| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | -| `Dify` | Dify | `api_key`, `url`, `user?`, `conversation_id?` | +| `AmazonBedrock` | Amazon Bedrock | `access_key`, `secret_key`, `region`, `model` | +| `Dify` | Dify | `api_key`, `url`, `model`, `user?`, `conversation_id?` | | `CustomLLM` | OpenAI-compatible LLM | `api_key`, `model`, `base_url` | --- @@ -285,7 +285,6 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `key` | `str` | Yes | — | Murf API key | | `voice_id` | `str` | No | `None` | Voice ID (e.g., 
`Ariana`, `Natalie`) | | `base_url` | `str` | No | `None` | WebSocket endpoint | -| `style` | `str` | No | `None` | Voice style (e.g., `Conversational`) | | `locale` | `str` | No | `None` | Voice locale | | `rate` | `float` | No | `None` | Speech rate | | `pitch` | `float` | No | `None` | Pitch adjustment | diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index b4f9fdd..0ab0f5b 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -261,6 +261,12 @@ def _is_interaction_language(value: typing.Any) -> bool: return isinstance(value, str) and value in _INTERACTION_LANGUAGES +def _validate_interaction_language(value: typing.Any) -> InteractionLanguage: + if not _is_interaction_language(value): + raise ValueError(f"Invalid interaction language: {value}") + return value # type: ignore[return-value] + + class Agent: """A reusable agent definition. @@ -322,7 +328,11 @@ def __init__( self._sal = sal self._advanced_features = advanced_features self._parameters = parameters - self._interaction_language = interaction_language + self._interaction_language = ( + _validate_interaction_language(interaction_language) + if interaction_language is not None + else None + ) self._geofence = geofence self._labels = labels self._rtc = rtc @@ -363,7 +373,7 @@ def with_interaction_language(self, language: InteractionLanguage) -> "Agent": remain under ``asr.params``, for example ``asr.params.language``. 
""" new_agent = self._clone() - new_agent._interaction_language = language + new_agent._interaction_language = _validate_interaction_language(language) return new_agent def with_mllm(self, vendor: BaseMLLM) -> "Agent": diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index ee6d3af..55b73ab 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -29,7 +29,7 @@ class OpenAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: Optional[str] = Field(default=None, description="OpenAI API key") - model: str = Field(default="gpt-4o-mini", description="Model name") + model: str = Field(..., description="Model name") base_url: Optional[str] = Field(default=None, description="Custom base URL") temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) @@ -49,6 +49,8 @@ class OpenAIOptions(BaseModel): @model_validator(mode="after") def _validate_byok_params(self) -> "OpenAIOptions": + if not self.model: + raise ValueError("OpenAI requires model") if self.api_key is not None and self.base_url is None: raise ValueError("OpenAI requires base_url when api_key is set") if self.api_key is None and self.base_url is not None: @@ -184,7 +186,7 @@ class AnthropicOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Anthropic API key") - model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + model: str = Field(..., description="Model name") url: str = Field(..., description="Anthropic messages endpoint URL") max_tokens: int = Field(..., gt=0) temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) @@ -251,7 +253,7 @@ class GeminiOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Google AI API key") - model: str = Field(default="gemini-2.0-flash-exp", description="Model name") 
+ model: str = Field(..., description="Model name") url: Optional[str] = Field(default=None, description="Custom API endpoint URL") temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) @@ -322,7 +324,7 @@ class GroqOptions(OpenAIOptions): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Groq API key") - model: str = Field(default="llama-3.3-70b-versatile", description="Model name") + model: str = Field(..., description="Model name") base_url: str = Field(..., description="Groq-compatible endpoint") @@ -383,6 +385,7 @@ class AmazonBedrockOptions(AnthropicOptions): access_key: str = Field(..., description="AWS access key ID") secret_key: str = Field(..., description="AWS secret access key") region: str = Field(..., description="AWS region") + model: str = Field(..., description="Amazon Bedrock model identifier") max_tokens: Optional[int] = Field(default=None, gt=0) api_key: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions compatibility") url: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions compatibility") diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index 61ceb2e..98dee4b 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -497,7 +497,6 @@ class MurfTTSOptions(BaseModel): key: str = Field(..., description="Murf API key") voice_id: Optional[str] = Field(default=None, description="Voice ID (e.g., 'Ariana', 'Natalie', 'Ken')") base_url: Optional[str] = Field(default=None, description="WebSocket endpoint") - style: Optional[str] = Field(default=None, description="Voice style (e.g., 'Angry', 'Sad', 'Conversational', 'Newscast')") locale: Optional[str] = Field(default=None, description="Voice locale") rate: Optional[float] = Field(default=None, description="Speech rate") pitch: Optional[float] = 
Field(default=None, description="Pitch adjustment") @@ -520,8 +519,6 @@ def to_config(self) -> Dict[str, Any]: params["base_url"] = self.options.base_url if self.options.voice_id is not None: params["voiceId"] = self.options.voice_id - if self.options.style is not None: - params["style"] = self.options.style if self.options.locale is not None: params["locale"] = self.options.locale if self.options.rate is not None: diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py index bcde1bf..6b4f08c 100644 --- a/tests/custom/test_llm_vendors.py +++ b/tests/custom/test_llm_vendors.py @@ -1,4 +1,6 @@ -from agora_agent import AmazonBedrock, Anthropic, AzureOpenAI, CustomLLM, Dify, Groq, VertexAILLM +import pytest + +from agora_agent import AmazonBedrock, Anthropic, AzureOpenAI, CustomLLM, Dify, Gemini, Groq, OpenAI, VertexAILLM def test_groq_serializes_as_openai_compatible() -> None: @@ -95,3 +97,28 @@ def test_dify_serializes_conversation_fields() -> None: assert config["params"]["model"] == "default" assert config["params"]["user"] == "user-1" assert config["params"]["conversation_id"] == "conversation-1" + + +def test_llm_vendors_reject_missing_required_models() -> None: + with pytest.raises(Exception, match="model"): + OpenAI(api_key="openai-key", base_url="https://api.openai.com/v1/chat/completions") + + with pytest.raises(Exception, match="model"): + Anthropic( + api_key="anthropic-key", + url="https://api.anthropic.com/v1/messages", + headers={"anthropic-version": "2023-06-01"}, + max_tokens=1024, + ) + + with pytest.raises(Exception, match="model"): + Gemini(api_key="google-key") + + with pytest.raises(Exception, match="model"): + Groq(api_key="groq-key", base_url="https://api.groq.com/openai/v1/chat/completions") + + with pytest.raises(Exception, match="model"): + VertexAILLM(api_key="vertex-token", project_id="project", location="us-central1") + + with pytest.raises(Exception, match="model"): + AmazonBedrock(access_key="aws-access", 
secret_key="aws-secret", region="us-east-1") diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index 1ab6e80..e4d8cb5 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -1,3 +1,5 @@ +import pytest + from agora_agent import ( Agent, AmazonSTT, @@ -64,6 +66,14 @@ def test_explicit_interaction_language_can_differ_from_provider_language() -> No assert props["asr"]["params"]["language"] == "en" +def test_invalid_explicit_interaction_language_is_rejected() -> None: + with pytest.raises(ValueError, match="Invalid interaction language: en"): + Agent(interaction_language="en") # type: ignore[arg-type] + + with pytest.raises(ValueError, match="Invalid interaction language: xx-YY"): + base_agent().with_interaction_language("xx-YY") # type: ignore[arg-type] + + def test_default_interaction_language_is_sent_without_stt() -> None: props = properties(base_agent()) From 968e1f03fa63dddaf85491ff8881baefc576ad1f Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Mon, 1 Jun 2026 23:13:35 -0400 Subject: [PATCH 15/26] Restrict managed OpenAI LLM models in AgentKit --- docs/concepts/vendors.md | 2 +- docs/reference/vendors.md | 4 +++- src/agora_agent/agentkit/vendors/llm.py | 5 +++++ tests/custom/test_llm_vendors.py | 10 ++++++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 42e3fcd..6aa116e 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -22,7 +22,7 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). 
| Class | Provider | Required Parameters | |---|---|---| | `OpenAI` | OpenAI | `model` for Agora-managed models; `api_key`, `base_url`, `model` for BYOK | -| `AzureOpenAI` | Azure OpenAI | `api_key`, `endpoint`, `deployment_name` | +| `AzureOpenAI` | Azure OpenAI | `api_key`, `model`, `endpoint`, `deployment_name` | | `Anthropic` | Anthropic | `api_key`, `model`, `url`, `headers`, `max_tokens` | | `Gemini` | Google Gemini | `api_key`, `model` | | `Groq` | Groq | `api_key`, `model`, `base_url` | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index da40142..abaa49e 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -51,6 +51,7 @@ llm = OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/comple | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Azure OpenAI API key | +| `model` | `str` | Yes | — | Deployment's base model name. Emitted as `params.model`. | | `endpoint` | `str` | Yes | — | Azure endpoint URL | | `deployment_name` | `str` | Yes | — | Azure deployment name | | `api_version` | `str` | No | `2024-08-01-preview` | Azure API version | @@ -73,6 +74,7 @@ from agora_agent import AzureOpenAI llm = AzureOpenAI( api_key='your-azure-key', + model='gpt-4o-mini', endpoint='https://your-resource.openai.azure.com', deployment_name='gpt-4o-mini', ) @@ -144,7 +146,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | Class | Provider | Key parameters | |---|---|---| -| `Groq` | Groq | `api_key`, `model`, `base_url?` | +| `Groq` | Groq | `api_key`, `model`, `base_url` | | `VertexAILLM` | Google Vertex AI | `api_key`, `model`, `project_id`, `location`, `url?` | | `AmazonBedrock` | Amazon Bedrock | `access_key`, `secret_key`, `region`, `model` | | `Dify` | Dify | `api_key`, `url`, `model`, `user?`, `conversation_id?` | diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 
55b73ab..ba3aa3e 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -5,6 +5,7 @@ from .base import BaseLLM LlmGreetingConfigs = Dict[str, Any] +_OPENAI_MANAGED_MODELS = {"gpt-4o-mini", "gpt-4.1-mini", "gpt-5-nano", "gpt-5-mini"} def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -55,6 +56,10 @@ def _validate_byok_params(self) -> "OpenAIOptions": raise ValueError("OpenAI requires base_url when api_key is set") if self.api_key is None and self.base_url is not None: raise ValueError("OpenAI base_url is only valid when api_key is set") + if self.api_key is None and self.model.strip().lower() not in _OPENAI_MANAGED_MODELS: + raise ValueError("OpenAI requires api_key unless using a supported Agora-managed model") + if self.api_key is None and self.vendor is not None: + raise ValueError("OpenAI Agora-managed mode does not allow vendor") return self class OpenAI(BaseLLM): diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py index 6b4f08c..2861e45 100644 --- a/tests/custom/test_llm_vendors.py +++ b/tests/custom/test_llm_vendors.py @@ -122,3 +122,13 @@ def test_llm_vendors_reject_missing_required_models() -> None: with pytest.raises(Exception, match="model"): AmazonBedrock(access_key="aws-access", secret_key="aws-secret", region="us-east-1") + + +def test_openai_managed_mode_is_restricted_to_supported_models() -> None: + assert OpenAI(model="gpt-5-mini").to_config()["params"]["model"] == "gpt-5-mini" + + with pytest.raises(Exception, match="api_key"): + OpenAI(model="gpt-4o") + + with pytest.raises(Exception, match="does not allow vendor"): + OpenAI(model="gpt-5-mini", vendor="custom") From 676b93b39c541b2c5f6f5e438c8833b9655c2a7a Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Tue, 2 Jun 2026 02:39:22 -0400 Subject: [PATCH 16/26] Align managed vendor validation with generated core shapes --- docs/concepts/vendors.md | 4 ++-- docs/reference/vendors.md | 7 
++++--- src/agora_agent/agentkit/vendors/stt.py | 9 ++++++++- src/agora_agent/agentkit/vendors/tts.py | 13 ++++++++----- tests/custom/test_stt_language.py | 8 ++++++++ tests/custom/test_tts_vendors.py | 10 ++++++++++ 6 files changed, 40 insertions(+), 11 deletions(-) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 6aa116e..5c49e23 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -54,7 +54,7 @@ Used with `agent.with_tts()`. Each TTS vendor produces audio at a specific sampl | `RimeTTS` | Rime | `key`, `speaker`, `model_id` | — | | `FishAudioTTS` | Fish Audio | `key`, `reference_id`, `backend` | — | | `GroqTTS` | Groq | `key` | — | -| `MiniMaxTTS` | MiniMax | `key` | — | +| `MiniMaxTTS` | MiniMax | `model` for supported Agora-managed models; `key`, `group_id`, `model`, `voice_id`, `url` for BYOK | — | | `DeepgramTTS` | Deepgram | `api_key`, `model` | Configurable | | `SarvamTTS` | Sarvam | `api_key` | — | @@ -80,7 +80,7 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to | Class | Provider | Required Parameters | |---|---|---| | `SpeechmaticsSTT` | Speechmatics | `api_key`, `language` | -| `DeepgramSTT` | Deepgram | — (all optional) | +| `DeepgramSTT` | Deepgram | `model` for Agora-managed `nova-2`/`nova-3`; `api_key` for BYOK | | `MicrosoftSTT` | Microsoft Azure | `key`, `region`, `language` | | `OpenAISTT` | OpenAI | `api_key` | | `GoogleSTT` | Google Cloud | `project_id`, `location`, `adc_credentials_string`, `language` | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index abaa49e..c63c1dd 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -190,11 +190,10 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `voice` | `str` | Yes | — | Voice: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` | | `model` | `str` | BYOK only | `None` | Model: `tts-1` or `tts-1-hd` | | `base_url` | `str` | BYOK only | `None` | 
OpenAI TTS endpoint URL | -| `response_format` | `str` | No | `None` | Audio format (e.g., `pcm`) | | `speed` | `float` | No | `None` | Speech speed multiplier | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | -`api_key`, `model`, and `base_url` are required together for BYOK. Without `api_key`, AgentKit uses the Agora-managed `tts-1` path. Fixed sample rate: 24000 Hz. +`api_key`, `model`, and `base_url` are required together for BYOK. Without `api_key`, `model` must be omitted or set to the Agora-managed `tts-1` path. Fixed sample rate: 24000 Hz. ### `CartesiaTTS` @@ -323,7 +322,7 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | No | `None` | Deepgram API key | +| `api_key` | `str` | BYOK only | `None` | Deepgram API key. Optional only for Agora-managed `nova-2` and `nova-3`. | | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | | `interaction_language` | `str` | No | `None` | Agora `asr.language` override | @@ -331,6 +330,8 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | +For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. For all other Deepgram models, AgentKit requires `api_key`. 
+ ### `MicrosoftSTT` | Parameter | Type | Required | Default | Description | diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index 48aa43b..47d94f1 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Literal from .base import BaseSTT @@ -41,6 +41,7 @@ ] _INTERACTION_LANGUAGES = set(InteractionLanguage.__args__) +_DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} def _interaction_language(language: Optional[str], interaction_language: Optional[InteractionLanguage]) -> Optional[InteractionLanguage]: @@ -97,6 +98,12 @@ class DeepgramSTTOptions(BaseModel): punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) + @model_validator(mode="after") + def _validate_managed_model(self) -> "DeepgramSTTOptions": + if self.api_key is None and (self.model is None or self.model.strip().lower() not in _DEEPGRAM_MANAGED_MODELS): + raise ValueError("DeepgramSTT requires api_key unless using a supported Agora-managed model") + return self + class DeepgramSTT(BaseSTT): def __init__(self, **kwargs: Any): self.options = DeepgramSTTOptions(**kwargs) diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index 98dee4b..dc90222 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator from .base import BaseTTS, CartesiaSampleRate, ElevenLabsSampleRate, GoogleTTSSampleRate, MicrosoftSampleRate +from ..presets import MiniMaxPresetModels, OpenAITtsPresetModels class ElevenLabsTTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -100,7 
+101,6 @@ class OpenAITTSOptions(BaseModel): voice: str = Field(..., description="Voice name (alloy, echo, fable, onyx, nova, shimmer)") model: Optional[str] = Field(default=None, description="Model name (tts-1, tts-1-hd)") base_url: Optional[str] = Field(default=None, description="Endpoint URL") - response_format: Optional[str] = Field(default=None, description="Audio format (e.g., pcm)") instructions: Optional[str] = Field(default=None, description="Custom voice instructions") speed: Optional[float] = Field(default=None, description="Speech speed multiplier") skip_patterns: Optional[List[int]] = Field(default=None) @@ -118,8 +118,11 @@ def _validate_byok_params(self) -> "OpenAITTSOptions": ] if missing: raise ValueError(f"OpenAITTS requires {', '.join(missing)} when api_key is set") - elif self.base_url is not None: - raise ValueError("OpenAITTS base_url is only valid when api_key is set") + else: + if self.model is not None and self.model.strip().lower() not in OpenAITtsPresetModels: + raise ValueError("OpenAITTS requires api_key unless using the Agora-managed tts-1 model") + if self.base_url is not None: + raise ValueError("OpenAITTS base_url is only valid when api_key is set") return self class OpenAITTS(BaseTTS): @@ -141,8 +144,6 @@ def to_config(self) -> Dict[str, Any]: elif self.options.model is not None: params["model"] = self.options.model - if self.options.response_format is not None: - params["response_format"] = self.options.response_format if self.options.instructions is not None: params["instructions"] = self.options.instructions if self.options.speed is not None: @@ -423,6 +424,8 @@ def _validate_byok_params(self) -> "MiniMaxTTSOptions": ] if missing: raise ValueError(f"MiniMaxTTS requires {', '.join(missing)} when key is set") + elif self.model.strip().lower() not in MiniMaxPresetModels: + raise ValueError("MiniMaxTTS requires key unless using a supported Agora-managed model") return self class MiniMaxTTS(BaseTTS): diff --git 
a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index e4d8cb5..6ebb484 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -81,6 +81,14 @@ def test_default_interaction_language_is_sent_without_stt() -> None: def test_stt_vendor_params_match_documented_shapes() -> None: + assert DeepgramSTT(model="nova-3", language="en-US").to_config()["params"] == { + "model": "nova-3", + "language": "en-US", + } + + with pytest.raises(Exception, match="api_key"): + DeepgramSTT(model="enhanced") + assert DeepgramSTT(api_key="dg-key", language="en").to_config()["params"] == { "key": "dg-key", "language": "en", diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index ec3bbbf..0930419 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -1,3 +1,5 @@ +import pytest + from agora_agent import AmazonTTS, CartesiaTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS @@ -97,3 +99,11 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: assert MurfTTS(key="murf-key").to_config()["params"] == { "api_key": "murf-key", } + + +def test_tts_managed_mode_validation_matches_core_shapes() -> None: + with pytest.raises(Exception, match="OpenAITTS requires api_key"): + OpenAITTS(voice="coral", model="tts-1-hd") + + with pytest.raises(Exception, match="MiniMaxTTS requires key"): + MiniMaxTTS(model="speech-02-turbo") From 8d52340e2ca0e087c2fad26b4401845d9531e1c2 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Tue, 2 Jun 2026 03:00:50 -0400 Subject: [PATCH 17/26] fix(agentkit): flatten Deepgram TTS passthrough params --- README.md | 36 +++---------------------- changelog.md | 2 +- docs/reference/vendors.md | 2 +- src/agora_agent/agentkit/vendors/tts.py | 9 +++---- tests/custom/test_tts_vendors.py | 10 ++++++- 5 files changed, 18 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 
9a80c3d..1fafba5 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ pip install agora-agents ## Quick Start Start with the `Agent` builder: create a client with app credentials, choose your ASR, LLM, and TTS providers, then start a session. Omit vendor API keys for supported Agora-managed models, or provide keys when you want BYOK. +Use `with_interaction_language()` for Agora `asr.language`; provider-specific STT language values remain under `asr.params`. ```python import os @@ -29,12 +30,9 @@ from agora_agent import ( Agent, Agora, Area, - DataChannel, DeepgramSTT, - GenericAvatar, MiniMaxTTS, OpenAI, - XaiGrok, expires_in_hours, ) @@ -56,35 +54,7 @@ def start_conversation() -> str: app_certificate=app_certificate, ) - agent = Agent( - name=f"conversation-{int(time.time())}", - turn_detection={ - "config": { - "speech_threshold": 0.5, - "start_of_speech": { - "mode": "vad", - "vad_config": { - "interrupt_duration_ms": 160, - "prefix_padding_ms": 300, - }, - }, - "end_of_speech": { - "mode": "vad", - "vad_config": { - "silence_duration_ms": 480, - }, - }, - }, - }, - advanced_features={ - "enable_rtm": True, - "enable_tools": True, - }, - parameters={ - "data_channel": DataChannel.RTM, - "enable_error_message": True, - }, - ).with_stt( + agent = Agent(name=f"conversation-{int(time.time())}").with_interaction_language("en-US").with_stt( DeepgramSTT( model="nova-3", language="en", @@ -131,7 +101,7 @@ def start_conversation() -> str: Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed models. ```python -agent = Agent().with_stt( +agent = Agent().with_interaction_language("en-US").with_stt( DeepgramSTT( api_key=os.environ["DEEPGRAM_API_KEY"], model="nova-3", diff --git a/changelog.md b/changelog.md index 1174850..efcc782 100644 --- a/changelog.md +++ b/changelog.md @@ -52,7 +52,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). 
### Added -- **`DeepgramTTS`** — New TTS vendor wrapper for Deepgram (Beta). Accepts `api_key`, `model`, `base_url`, `sample_rate`, `params`, and `skip_patterns`. +- **`DeepgramTTS`** — New TTS vendor wrapper for Deepgram (Beta). Accepts `api_key`, `model`, `base_url`, `sample_rate`, `additional_params`, and `skip_patterns`. - **`Agent.with_tools(enabled=True)`** — Dedicated builder method to enable MCP tool invocation (`advanced_features.enable_tools`). Replaces the raw `with_advanced_features(AdvancedFeatures(enable_tools=True))` call. - **LLM vendors: `headers` field** — All four LLM vendors (`OpenAI`, `AzureOpenAI`, `Anthropic`, `Gemini`) now accept an optional `headers: Dict[str, str]` parameter. Use this to pass custom HTTP headers to the LLM provider (e.g., tenant identifiers, routing headers). - **`AgentSession.think()` / `AsyncAgentSession.think()`** — Send a custom instruction to a running agent through the `agent_management` API. diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index c63c1dd..2a39993 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -233,7 +233,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `model` | `str` | Yes | — | Deepgram TTS model (e.g., `aura-2-thalia-en`) | | `base_url` | `str` | No | `None` | WebSocket endpoint; defaults server-side to `wss://api.deepgram.com/v1/speak` | | `sample_rate` | `int` | No | `None` | Sample rate in Hz (for example, `24000`) | -| `params` | `Dict[str, Any]` | No | `None` | Additional Deepgram TTS parameters | +| `additional_params` | `Dict[str, Any]` | No | `None` | Additional Deepgram TTS parameters, flattened into `params` | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `HumeAITTS` diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index dc90222..a052ea5 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ 
-268,7 +268,7 @@ class DeepgramTTSOptions(BaseModel): model: str = Field(..., description="Deepgram TTS model (e.g., 'aura-2-thalia-en')") base_url: Optional[str] = Field(default=None, description="WebSocket endpoint") sample_rate: Optional[int] = Field(default=None, description="Sample rate in Hz") - params: Optional[Dict[str, Any]] = Field(default=None, description="Additional Deepgram TTS parameters") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional Deepgram TTS parameters") skip_patterns: Optional[List[int]] = Field(default=None) class DeepgramTTS(BaseTTS): @@ -280,17 +280,16 @@ def sample_rate(self) -> Optional[int]: return self.options.sample_rate def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "api_key": self.options.api_key, "model": self.options.model, - **(self.options.params or {}), - } + }) if self.options.base_url is not None: params["base_url"] = self.options.base_url if self.options.sample_rate is not None: params["sample_rate"] = self.options.sample_rate - result: Dict[str, Any] = {"vendor": "deepgram", "params": params} if self.options.skip_patterns is not None: result["skip_patterns"] = self.options.skip_patterns diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index 0930419..2936872 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -1,6 +1,6 @@ import pytest -from agora_agent import AmazonTTS, CartesiaTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS +from agora_agent import AmazonTTS, CartesiaTTS, DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS def test_tts_vendor_params_match_generated_core_shapes() -> None: @@ -44,6 +44,14 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: "voice_id": 
"voice", } + assert DeepgramTTS(api_key="deepgram-key", model="aura-2-thalia-en", base_url="wss://api.deepgram.com/v1/speak", sample_rate=24000, additional_params={"encoding": "linear16"}).to_config()["params"] == { + "api_key": "deepgram-key", + "model": "aura-2-thalia-en", + "base_url": "wss://api.deepgram.com/v1/speak", + "sample_rate": 24000, + "encoding": "linear16", + } + assert OpenAITTS(api_key="openai-key", voice="coral", model="gpt-4o-mini-tts", base_url="https://api.openai.com/v1").to_config()["params"] == { "voice": "coral", "api_key": "openai-key", From 403a1a9ac18fd05139ac0140d1837d28f5805dcb Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 07:03:42 +0000 Subject: [PATCH 18/26] [fern-generated] Update SDK Generated by Fern CLI Version: unknown Generators: - fernapi/fern-python-sdk: 4.37.0 --- reference.md | 21 ++- src/agora_agent/agents/client.py | 42 +++-- .../types/start_agents_request_properties.py | 12 +- .../start_agents_request_properties_asr.py | 47 ----- ...rt_agents_request_properties_asr_vendor.py | 10 - .../start_agents_request_properties_llm.py | 115 ------------ ...request_properties_llm_greeting_configs.py | 43 ----- ...st_properties_llm_greeting_configs_mode.py | 7 - ...request_properties_llm_mcp_servers_item.py | 54 ------ ...art_agents_request_properties_llm_style.py | 5 - .../start_agents_request_properties_mllm.py | 86 --------- ..._request_properties_mllm_turn_detection.py | 61 ------- ...es_mllm_turn_detection_agora_vad_config.py | 42 ----- ...est_properties_mllm_turn_detection_mode.py | 7 - ...mllm_turn_detection_semantic_vad_config.py | 32 ---- ...detection_semantic_vad_config_eagerness.py | 7 - ...s_mllm_turn_detection_server_vad_config.py | 62 ------- ...t_agents_request_properties_mllm_vendor.py | 5 - src/agora_agent/core/client_wrapper.py | 4 +- src/agora_agent/types/amazon_asr.py | 27 +++ src/agora_agent/types/amazon_asr_params.py | 52 ++++++ 
src/agora_agent/types/amazon_tts_params.py | 16 +- .../types/amazon_tts_params_engine.py | 5 + src/agora_agent/types/ares_asr.py | 27 +++ src/agora_agent/types/ares_asr_params.py | 5 + src/agora_agent/types/asr.py | 172 ++++++++++++++++++ src/agora_agent/types/asr_language.py | 41 +++++ src/agora_agent/types/assembly_ai_asr.py | 27 +++ .../types/assembly_ai_asr_params.py | 37 ++++ .../types/cartesia_tts_output_format.py | 32 ++++ src/agora_agent/types/cartesia_tts_params.py | 17 +- src/agora_agent/types/deepgram_asr.py | 31 ++++ src/agora_agent/types/deepgram_asr_params.py | 47 +++++ src/agora_agent/types/deepgram_tts_params.py | 5 - .../types/eleven_labs_tts_params.py | 27 ++- .../types/fish_audio_tts_params.py | 7 +- src/agora_agent/types/google_asr.py | 27 +++ src/agora_agent/types/google_asr_params.py | 47 +++++ .../types/google_tts_audio_config.py | 32 ++++ src/agora_agent/types/google_tts_params.py | 28 ++- .../google_tts_voice_selection_params.py | 27 +++ src/agora_agent/types/hume_ai_tts_params.py | 28 ++- .../types/hume_ai_tts_params_provider.py | 5 + src/agora_agent/types/llm.py | 120 ++++++++++++ src/agora_agent/types/llm_params.py | 32 ++++ src/agora_agent/types/llm_style.py | 5 + src/agora_agent/types/microsoft_asr.py | 27 +++ src/agora_agent/types/microsoft_asr_params.py | 42 +++++ src/agora_agent/types/microsoft_tts_params.py | 10 + src/agora_agent/types/mllm.py | 88 +++++++++ src/agora_agent/types/mllm_http_options.py | 27 +++ .../types/mllm_input_audio_transcription.py | 37 ++++ src/agora_agent/types/mllm_params.py | 71 ++++++++ src/agora_agent/types/mllm_turn_detection.py | 35 ++++ .../mllm_turn_detection_agora_vad_config.py | 23 +++ .../types/mllm_turn_detection_mode.py | 5 + ...mllm_turn_detection_semantic_vad_config.py | 21 +++ ...detection_semantic_vad_config_eagerness.py | 5 + .../mllm_turn_detection_server_vad_config.py | 31 ++++ ...r_vad_config_end_of_speech_sensitivity.py} | 2 +- ...vad_config_start_of_speech_sensitivity.py} | 2 +- 
src/agora_agent/types/mllm_vendor.py | 5 + src/agora_agent/types/murf_tts_params.py | 39 +++- src/agora_agent/types/open_ai_asr.py | 27 +++ src/agora_agent/types/open_ai_asr_params.py | 30 +++ .../open_ai_input_audio_transcription.py | 37 ++++ src/agora_agent/types/open_ai_tts_params.py | 17 +- src/agora_agent/types/rime_tts_params.py | 13 +- src/agora_agent/types/sarvam_asr.py | 27 +++ src/agora_agent/types/sarvam_asr_params.py | 32 ++++ src/agora_agent/types/sarvam_tts_params.py | 25 ++- .../sarvam_tts_params_target_language_code.py | 8 + src/agora_agent/types/speechmatics_asr.py | 27 +++ .../types/speechmatics_asr_params.py | 37 ++++ tests/custom/test_avatar_token.py | 12 -- tests/custom/test_llm_vendors.py | 60 ------ tests/custom/test_root_exports.py | 29 --- 77 files changed, 1676 insertions(+), 763 deletions(-) delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_asr.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_llm_style.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py delete mode 100644 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py delete mode 100644 src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py create mode 100644 src/agora_agent/types/amazon_asr.py create mode 100644 src/agora_agent/types/amazon_asr_params.py create mode 100644 src/agora_agent/types/amazon_tts_params_engine.py create mode 100644 src/agora_agent/types/ares_asr.py create mode 100644 src/agora_agent/types/ares_asr_params.py create mode 100644 src/agora_agent/types/asr.py create mode 100644 src/agora_agent/types/asr_language.py create mode 100644 src/agora_agent/types/assembly_ai_asr.py create mode 100644 src/agora_agent/types/assembly_ai_asr_params.py create mode 100644 src/agora_agent/types/cartesia_tts_output_format.py create mode 100644 src/agora_agent/types/deepgram_asr.py create mode 100644 src/agora_agent/types/deepgram_asr_params.py create mode 100644 src/agora_agent/types/google_asr.py create mode 100644 src/agora_agent/types/google_asr_params.py create mode 100644 src/agora_agent/types/google_tts_audio_config.py create mode 100644 src/agora_agent/types/google_tts_voice_selection_params.py create mode 100644 src/agora_agent/types/hume_ai_tts_params_provider.py create mode 100644 src/agora_agent/types/llm.py create mode 100644 src/agora_agent/types/llm_params.py create mode 100644 src/agora_agent/types/llm_style.py create mode 100644 src/agora_agent/types/microsoft_asr.py create mode 100644 src/agora_agent/types/microsoft_asr_params.py create mode 100644 src/agora_agent/types/mllm.py create mode 100644 src/agora_agent/types/mllm_http_options.py create mode 100644 src/agora_agent/types/mllm_input_audio_transcription.py create mode 100644 
src/agora_agent/types/mllm_params.py create mode 100644 src/agora_agent/types/mllm_turn_detection.py create mode 100644 src/agora_agent/types/mllm_turn_detection_agora_vad_config.py create mode 100644 src/agora_agent/types/mllm_turn_detection_mode.py create mode 100644 src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py create mode 100644 src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py create mode 100644 src/agora_agent/types/mllm_turn_detection_server_vad_config.py rename src/agora_agent/{agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py => types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py} (61%) rename src/agora_agent/{agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py => types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py} (61%) create mode 100644 src/agora_agent/types/mllm_vendor.py create mode 100644 src/agora_agent/types/open_ai_asr.py create mode 100644 src/agora_agent/types/open_ai_asr_params.py create mode 100644 src/agora_agent/types/open_ai_input_audio_transcription.py create mode 100644 src/agora_agent/types/sarvam_asr.py create mode 100644 src/agora_agent/types/sarvam_asr_params.py create mode 100644 src/agora_agent/types/sarvam_tts_params_target_language_code.py create mode 100644 src/agora_agent/types/speechmatics_asr.py create mode 100644 src/agora_agent/types/speechmatics_asr_params.py delete mode 100644 tests/custom/test_avatar_token.py delete mode 100644 tests/custom/test_llm_vendors.py delete mode 100644 tests/custom/test_root_exports.py diff --git a/reference.md b/reference.md index 55a516e..57fc92a 100644 --- a/reference.md +++ b/reference.md @@ -27,11 +27,16 @@ Create and start a Conversational AI agent instance.
```python -from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft +from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, +) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -51,9 +56,7 @@ client.agents.start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -61,13 +64,15 @@ client.agents.start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/client.py b/src/agora_agent/agents/client.py index 3f6af4c..e923c9a 100644 --- a/src/agora_agent/agents/client.py +++ b/src/agora_agent/agents/client.py @@ -84,11 +84,16 @@ def start( Examples -------- - from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -108,9 +113,7 @@ def start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - 
asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -118,13 +121,15 @@ def start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", @@ -641,11 +646,16 @@ async def start( -------- import asyncio - from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Asr_Ares, + AsyncAgora, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -668,9 +678,7 @@ async def main() -> None: agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -678,13 +686,15 @@ async def main() -> None: voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py b/src/agora_agent/agents/types/start_agents_request_properties.py index 
06c3482..3cddb7e 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties.py +++ b/src/agora_agent/agents/types/start_agents_request_properties.py @@ -5,15 +5,15 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr import Asr +from ...types.llm import Llm +from ...types.mllm import Mllm from ...types.tts import Tts from .start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures -from .start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr from .start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from .start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords from .start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence from .start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption -from .start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm -from .start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm from .start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters from .start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc from .start_agents_request_properties_sal import StartAgentsRequestPropertiesSal @@ -67,7 +67,7 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Advanced features configuration. """ - asr: typing.Optional[StartAgentsRequestPropertiesAsr] = pydantic.Field(default=None) + asr: typing.Optional[Asr] = pydantic.Field(default=None) """ Automatic Speech Recognition (ASR) configuration. """ @@ -77,12 +77,12 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Text-to-speech (TTS) module configuration. 
""" - llm: typing.Optional[StartAgentsRequestPropertiesLlm] = pydantic.Field(default=None) + llm: typing.Optional[Llm] = pydantic.Field(default=None) """ Large language model (LLM) configuration. """ - mllm: typing.Optional[StartAgentsRequestPropertiesMllm] = pydantic.Field(default=None) + mllm: typing.Optional[Mllm] = pydantic.Field(default=None) """ Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr.py b/src/agora_agent/agents/types/start_agents_request_properties_asr.py deleted file mode 100644 index 7385e17..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr.py +++ /dev/null @@ -1,47 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor - - -class StartAgentsRequestPropertiesAsr(UncheckedBaseModel): - """ - Automatic Speech Recognition (ASR) configuration. - """ - - language: typing.Optional[str] = pydantic.Field(default=None) - """ - The BCP-47 language tag identifying the primary language used for agent interaction. If `params` contains a vendor-specific language code, it takes precedence over this setting. 
- """ - - vendor: typing.Optional[StartAgentsRequestPropertiesAsrVendor] = pydantic.Field(default=None) - """ - ASR provider: - - `ares`: Adaptive Recognition Engine for Speech - - `microsoft`: Microsoft Azure - - `deepgram`: Deepgram - - `openai`: OpenAI (Beta) - - `speechmatics`: Speechmatics - - `assemblyai`: AssemblyAI (Beta) - - `amazon`: Amazon Transcribe (Beta) - - `google`: Google (Beta) - - `sarvam`: Sarvam (Beta) - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - The configuration parameters for the ASR vendor. See [ASR Overview](https://docs.agora.io/en/conversational-ai/models/asr/overview) for details. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py deleted file mode 100644 index 973d62c..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py +++ /dev/null @@ -1,10 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesAsrVendor = typing.Union[ - typing.Literal[ - "ares", "microsoft", "deepgram", "openai", "google", "amazon", "assemblyai", "speechmatics", "sarvam" - ], - typing.Any, -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py deleted file mode 100644 index 9ab0f62..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ /dev/null @@ -1,115 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs -from .start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem -from .start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle - - -class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): - """ - Large language model (LLM) configuration. - """ - - url: str = pydantic.Field() - """ - The LLM callback address. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The LLM verification API key. The default value is an empty string. Ensure that you enable the API key in a production environment. - """ - - system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - A set of predefined information used as input to the LLM, including prompt words and examples. - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional LLM configuration parameters, such as the `model` used, and the maximum token limit. For details about each supported LLM, refer to [Supported LLMs](https://docs.agora.io/en/conversational-ai/models/llm/overview#supported-llms). - """ - - max_history: typing.Optional[int] = pydantic.Field(default=None) - """ - The number of conversation history messages cached in the custom LLM. History includes user and agent dialog messages, tool call information, and timestamps. Agent and user messages are recorded separately. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM input modalities: - - `["text"]`: Text only - - `["text", "image"]`: Text plus image. 
Recommended configuration, requires the selected LLM to support visual input - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM output modalities: - - `["text"]`: The output text is converted to speech by the TTS module and then published to the RTC channel. - - `["audio"]`: Voice only. Voice is published directly to the RTC channel. - - `["text", "audio"]`: Text plus voice. Write your own logic to process the output of LLM as needed. - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting. If provided, the first user in the channel is automatically greeted with the message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Prompt for agent activation failure. If provided, it is returned through TTS when the custom LLM call fails. - """ - - vendor: typing.Optional[str] = pydantic.Field(default=None) - """ - LLM provider, supports the following settings: - - `custom`: Custom LLM. When you set this option, the agent includes the following fields, in addition to `role` and `content` when making requests to the custom LLM: - - `turn_id`: A unique identifier for each conversation turn. It starts from `0` and increments with each turn. One user-agent interaction corresponds to one `turn_id`. - - `timestamp`: The request timestamp, in milliseconds. - - `azure`: Use this value for Azure OpenAI - """ - - style: typing.Optional[StartAgentsRequestPropertiesLlmStyle] = pydantic.Field(default=None) - """ - The request style for chat completion: - - `openai`: For OpenAI and OpenAI-compatible APIs - - `gemini`: For Google Gemini and Google Vertex API format - - `anthropic`: For Anthropic Claude API format - - `dify`: For Dify API format - """ - - greeting_configs: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigs] = pydantic.Field(default=None) - """ - Agent greeting broadcast configuration. 
- """ - - template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Template parameter configuration used to insert variables into the agent's `system_messages`, `greeting_message`, `failure_message`, and `parameters.silence_config.content` text. Uses key-value pairs, where the key is the variable name and the value is the variable's value. To insert defined variables in the prompt text, use the syntax `{{variable_name}}`. The system automatically replaces each variable with the corresponding value defined in `template_variables`. Variable values cannot reference other variables. - """ - - mcp_servers: typing.Optional[typing.List[StartAgentsRequestPropertiesLlmMcpServersItem]] = pydantic.Field( - default=None - ) - """ - MCP (Model Context Protocol) server configuration. By configuring MCP servers, agents can call tools provided by external services to implement advanced functionality. - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Custom headers to include in requests to the LLM. Use this field to pass business-specific information such as custom fields or tenant identifiers. These headers are merged with the headers generated by the Conversational AI Engine. If a key conflict occurs, the engine-generated header takes precedence. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py deleted file mode 100644 index c0d7046..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ /dev/null @@ -1,43 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs_mode import ( - StartAgentsRequestPropertiesLlmGreetingConfigsMode, -) - - -class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): - """ - Agent greeting broadcast configuration. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigsMode] = pydantic.Field(default=None) - """ - Determines when the agent sends greeting messages to users joining the channel. - - `single_every`: Broadcasts a greeting every time a user joins the channel. - - `single_first`: Broadcasts a greeting only once to the first user who joins the channel. - """ - - delay_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The delay in milliseconds before the agent plays the greeting message after a user joins the channel. - """ - - interruptable: typing.Optional[bool] = pydantic.Field(default=None) - """ - - `true`: Follows the global `interruption` configuration. - - `false`: Uninterruptible. The greeting plays in its entirety. 
If the user speaks multiple times while the greeting plays, the system merges the speech segments after the greeting ends and sends them to the LLM for a single response. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py deleted file mode 100644 index 44e4a55..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesLlmGreetingConfigsMode = typing.Union[ - typing.Literal["single_every", "single_first"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py deleted file mode 100644 index 0474072..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py +++ /dev/null @@ -1,54 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesLlmMcpServersItem(UncheckedBaseModel): - name: str = pydantic.Field() - """ - A unique identifier for the MCP server. Maximum 48 characters. Accepts only English letters and numbers. - """ - - endpoint: str = pydantic.Field() - """ - The endpoint address of the MCP server. The agent uses this to communicate with the MCP server. 
- """ - - transport: typing.Optional[typing.Literal["streamable_http"]] = pydantic.Field(default=None) - """ - Transport protocol type. - - `streamable_http`: Streaming HTTP protocol - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - HTTP header information to include when requesting the MCP server, such as authentication information. - """ - - allowed_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - A list of tools that the agent is allowed to invoke. The agent can only use tools on this list. - - Empty or omitted: All tools are enabled. - - Empty array `[]`: No tools are enabled. - - `["*"]`: All tools are enabled. - - Specific tools `["aa", "bb"]`: Only listed tools are enabled. - - Mix with wildcard `["aa", "*"]`: All tools are enabled (wildcard takes precedence). - """ - - timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The MCP server request timeout in milliseconds. After timeout, the agent stops waiting for the MCP server's response and continues executing subsequent logic. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py deleted file mode 100644 index eaa9a0d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesLlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py deleted file mode 100644 index 0993ebc..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection -from .start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor - - -class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - """ - Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. - """ - - enable: typing.Optional[bool] = pydantic.Field(default=None) - """ - Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. Replaces the deprecated `advanced_features.enable_mllm`. - """ - - url: typing.Optional[str] = pydantic.Field(default=None) - """ - The MLLM WebSocket URL for real-time communication. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The API key used for MLLM authentication. - """ - - messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - Array of conversation items used for short-term memory management. Uses the same structure as `item.content` from the OpenAI Realtime API. 
- """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional MLLM configuration parameters. The `modalities` setting is overridden by `input_modalities` and `output_modalities`. The `turn_detection` setting is overridden by `mllm.turn_detection`. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM input modalities: - - `["audio"]`: Audio only - - `["audio", "text"]`: Audio plus text - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM output modalities: - - `["text", "audio"]`: Text plus audio - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting message. If provided, the first user in the channel is automatically greeted with this message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent failure message. If provided, the agent speaks this message when an MLLM request fails. - """ - - vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) - """ - MLLM provider. Currently supports: - - `openai`: OpenAI Realtime API - - `gemini`: Google Gemini Live - - `vertexai`: Google Gemini Live (Vertex AI) - - `xai`: xAI Grok Realtime API - """ - - turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py deleted file mode 100644 index 032979d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_agora_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig, -) - - -class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionMode] = pydantic.Field(default=None) - """ - Turn detection mode for MLLM: - - `agora_vad`: Agora VAD-based detection. - - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API, Gemini Live, and xAI Grok. - - `semantic_vad`: Semantic-based detection. 
Supported by OpenAI Realtime API only. - """ - - agora_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. - """ - - server_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - semantic_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig] = ( - pydantic.Field(default=None) - ) - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py deleted file mode 100644 index ec30215..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py +++ /dev/null @@ -1,42 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. 
- """ - - interrupt_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Minimum duration of speech in milliseconds required to trigger an interruption. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. A higher value reduces false positives. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py deleted file mode 100644 index 0d004e8..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionMode = typing.Union[ - typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py deleted file mode 100644 index 1e310f0..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py +++ /dev/null @@ -1,32 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - eagerness: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness] = ( - pydantic.Field(default=None) - ) - """ - Controls how eagerly the model ends its turn. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py deleted file mode 100644 index 8b67b1d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness = typing.Union[ - typing.Literal["auto", "low", "medium", "high"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py deleted file mode 100644 index c74d8d7..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py +++ /dev/null @@ -1,62 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBaseModel): - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. Applicable to OpenAI Realtime API and xAI Grok. 
- """ - - idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Idle timeout in milliseconds. Applicable to OpenAI Realtime API only. - """ - - start_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for start of speech detection. Applicable to Gemini Live only. - """ - - end_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for end of speech detection. Applicable to Gemini Live only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py deleted file mode 100644 index 0233696..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index c44e886..acd9073 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.0.0", + "User-Agent": "agora-agents/v2.1.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.0.0", + "X-Fern-SDK-Version": "v2.1.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/amazon_asr.py b/src/agora_agent/types/amazon_asr.py new file mode 100644 index 0000000..4054518 --- /dev/null +++ b/src/agora_agent/types/amazon_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_asr_params import AmazonAsrParams +from .asr_language import AsrLanguage + + +class AmazonAsr(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_asr_params.py b/src/agora_agent/types/amazon_asr_params.py new file mode 100644 index 0000000..1d30688 --- /dev/null +++ b/src/agora_agent/types/amazon_asr_params.py @@ -0,0 +1,52 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AmazonAsrParams(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration parameters. + """ + + region: str = pydantic.Field() + """ + AWS region + """ + + access_key_id: str = pydantic.Field() + """ + AWS access key ID + """ + + secret_access_key: str = pydantic.Field() + """ + AWS secret access key + """ + + language_code: str = pydantic.Field() + """ + Language code for speech recognition + """ + + media_sample_rate_hz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hertz for the audio input + """ + + media_encoding: typing.Optional[str] = pydantic.Field(default=None) + """ + Encoding format of the audio input + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_tts_params.py b/src/agora_agent/types/amazon_tts_params.py index baaa6fa..bbecb36 100644 --- a/src/agora_agent/types/amazon_tts_params.py +++ b/src/agora_agent/types/amazon_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import 
IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_tts_params_engine import AmazonTtsParamsEngine class AmazonTtsParams(UncheckedBaseModel): @@ -12,26 +13,31 @@ class AmazonTtsParams(UncheckedBaseModel): Amazon Polly TTS configuration parameters. """ - access_key: str = pydantic.Field() + aws_access_key_id: str = pydantic.Field() """ - AWS access key + AWS access key ID """ - secret_key: str = pydantic.Field() + aws_secret_access_key: str = pydantic.Field() """ AWS secret key """ - region: str = pydantic.Field() + region_name: str = pydantic.Field() """ AWS region (e.g., "us-east-1") """ - voice_id: str = pydantic.Field() + voice: str = pydantic.Field() """ Amazon Polly voice ID """ + engine: AmazonTtsParamsEngine = pydantic.Field() + """ + Amazon Polly engine type + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/amazon_tts_params_engine.py b/src/agora_agent/types/amazon_tts_params_engine.py new file mode 100644 index 0000000..d9e3cfe --- /dev/null +++ b/src/agora_agent/types/amazon_tts_params_engine.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AmazonTtsParamsEngine = typing.Union[typing.Literal["standard", "neural", "long-form", "generative"], typing.Any] diff --git a/src/agora_agent/types/ares_asr.py b/src/agora_agent/types/ares_asr.py new file mode 100644 index 0000000..cf42216 --- /dev/null +++ b/src/agora_agent/types/ares_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage + + +class AresAsr(UncheckedBaseModel): + """ + Adaptive Recognition Engine for Speech ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/ares_asr_params.py b/src/agora_agent/types/ares_asr_params.py new file mode 100644 index 0000000..afa1d76 --- /dev/null +++ b/src/agora_agent/types/ares_asr_params.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AresAsrParams = typing.Dict[str, typing.Any] diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py new file mode 100644 index 0000000..f08086f --- /dev/null +++ b/src/agora_agent/types/asr.py @@ -0,0 +1,172 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +from __future__ import annotations + +import typing + +import pydantic +import typing_extensions +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel, UnionMetadata +from .amazon_asr_params import AmazonAsrParams +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams +from .deepgram_asr_params import DeepgramAsrParams +from .google_asr_params import GoogleAsrParams +from .microsoft_asr_params import MicrosoftAsrParams +from .open_ai_asr_params import OpenAiAsrParams +from .sarvam_asr_params import SarvamAsrParams +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class Asr_Ares(UncheckedBaseModel): + vendor: typing.Literal["ares"] = "ares" + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Microsoft(UncheckedBaseModel): + vendor: typing.Literal["microsoft"] = "microsoft" + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Deepgram(UncheckedBaseModel): + vendor: typing.Literal["deepgram"] = "deepgram" + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + 
smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Openai(UncheckedBaseModel): + vendor: typing.Literal["openai"] = "openai" + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Google(UncheckedBaseModel): + vendor: typing.Literal["google"] = "google" + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Amazon(UncheckedBaseModel): + vendor: typing.Literal["amazon"] = "amazon" + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Assemblyai(UncheckedBaseModel): + vendor: typing.Literal["assemblyai"] = "assemblyai" + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Speechmatics(UncheckedBaseModel): + vendor: typing.Literal["speechmatics"] = "speechmatics" + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = 
pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Sarvam(UncheckedBaseModel): + vendor: typing.Literal["sarvam"] = "sarvam" + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +Asr = typing_extensions.Annotated[ + typing.Union[ + Asr_Ares, + Asr_Microsoft, + Asr_Deepgram, + Asr_Openai, + Asr_Google, + Asr_Amazon, + Asr_Assemblyai, + Asr_Speechmatics, + Asr_Sarvam, + ], + UnionMetadata(discriminant="vendor"), +] diff --git a/src/agora_agent/types/asr_language.py b/src/agora_agent/types/asr_language.py new file mode 100644 index 0000000..4ff3c88 --- /dev/null +++ b/src/agora_agent/types/asr_language.py @@ -0,0 +1,41 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AsrLanguage = typing.Union[ + typing.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ], + typing.Any, +] diff --git a/src/agora_agent/types/assembly_ai_asr.py b/src/agora_agent/types/assembly_ai_asr.py new file mode 100644 index 0000000..ea2ebf4 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams + + +class AssemblyAiAsr(UncheckedBaseModel): + """ + AssemblyAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/assembly_ai_asr_params.py b/src/agora_agent/types/assembly_ai_asr_params.py new file mode 100644 index 0000000..f3a5818 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AssemblyAiAsrParams(UncheckedBaseModel): + """ + AssemblyAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + AssemblyAI API key + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for AssemblyAI's streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_output_format.py b/src/agora_agent/types/cartesia_tts_output_format.py new file mode 100644 index 0000000..ab7e122 --- /dev/null +++ b/src/agora_agent/types/cartesia_tts_output_format.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class CartesiaTtsOutputFormat(UncheckedBaseModel): + """ + Cartesia audio output format configuration. 
+ """ + + container: typing.Optional[str] = pydantic.Field(default=None) + """ + Audio container format for the output stream + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sampling rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_params.py b/src/agora_agent/types/cartesia_tts_params.py index 2aaf069..1478570 100644 --- a/src/agora_agent/types/cartesia_tts_params.py +++ b/src/agora_agent/types/cartesia_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .cartesia_tts_output_format import CartesiaTtsOutputFormat from .cartesia_tts_voice import CartesiaTtsVoice @@ -18,15 +19,21 @@ class CartesiaTtsParams(UncheckedBaseModel): Cartesia API key """ - voice: CartesiaTtsVoice - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: str = pydantic.Field() """ - Model ID (optional) + Model ID (for example, sonic-2) """ - sample_rate: typing.Optional[int] = pydantic.Field(default=None) + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Cartesia streaming API + """ + + voice: CartesiaTtsVoice + output_format: typing.Optional[CartesiaTtsOutputFormat] = None + language: typing.Optional[str] = pydantic.Field(default=None) """ - Audio sampling rate in Hz + Target language for speech synthesis """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py new file mode 100644 index 0000000..1c79c7b --- /dev/null +++ b/src/agora_agent/types/deepgram_asr.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .deepgram_asr_params import DeepgramAsrParams + + +class DeepgramAsr(UncheckedBaseModel): + """ + Deepgram ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands for preset-backed Deepgram usage. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py new file mode 100644 index 0000000..259958e --- /dev/null +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class DeepgramAsrParams(UncheckedBaseModel): + """ + Deepgram ASR configuration parameters. 
+ """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for Deepgram's streaming API + """ + + key: str = pydantic.Field() + """ + Deepgram API key + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Speech recognition model + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for speech recognition + """ + + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_tts_params.py b/src/agora_agent/types/deepgram_tts_params.py index e858291..ebac500 100644 --- a/src/agora_agent/types/deepgram_tts_params.py +++ b/src/agora_agent/types/deepgram_tts_params.py @@ -32,11 +32,6 @@ class DeepgramTtsParams(UncheckedBaseModel): Audio sampling rate in Hz """ - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional Deepgram TTS parameters - """ - skip_patterns: typing.Optional[typing.List[int]] = pydantic.Field(default=None) """ Controls whether the TTS module skips bracketed content when reading LLM response text. diff --git a/src/agora_agent/types/eleven_labs_tts_params.py b/src/agora_agent/types/eleven_labs_tts_params.py index c6127fd..4a2bf8f 100644 --- a/src/agora_agent/types/eleven_labs_tts_params.py +++ b/src/agora_agent/types/eleven_labs_tts_params.py @@ -12,7 +12,7 @@ class ElevenLabsTtsParams(UncheckedBaseModel): ElevenLabs TTS configuration parameters. 
""" - base_url: typing.Optional[str] = pydantic.Field(default=None) + base_url: str = pydantic.Field() """ WebSocket URL (e.g., "wss://api.elevenlabs.io/v1") """ @@ -37,6 +37,31 @@ class ElevenLabsTtsParams(UncheckedBaseModel): Audio sample rate in Hz (16kHz for Akool, 24kHz for HeyGen) """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech speed multiplier. + """ + + stability: typing.Optional[float] = pydantic.Field(default=None) + """ + Voice stability. Higher values produce more consistent speech. + """ + + similarity_boost: typing.Optional[float] = pydantic.Field(default=None) + """ + Similarity boost for the selected voice. + """ + + style: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking style and expressiveness control. + """ + + use_speaker_boost: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to improve voice quality and similarity. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/fish_audio_tts_params.py b/src/agora_agent/types/fish_audio_tts_params.py index 0ad77aa..9bb4ebb 100644 --- a/src/agora_agent/types/fish_audio_tts_params.py +++ b/src/agora_agent/types/fish_audio_tts_params.py @@ -12,7 +12,7 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio TTS configuration parameters. 
""" - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Fish Audio API key """ @@ -22,6 +22,11 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio reference ID """ + backend: str = pydantic.Field() + """ + Backend model version to use + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/google_asr.py b/src/agora_agent/types/google_asr.py new file mode 100644 index 0000000..8473a04 --- /dev/null +++ b/src/agora_agent/types/google_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .google_asr_params import GoogleAsrParams + + +class GoogleAsr(UncheckedBaseModel): + """ + Google ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_asr_params.py b/src/agora_agent/types/google_asr_params.py new file mode 100644 index 0000000..9d17db6 --- /dev/null +++ b/src/agora_agent/types/google_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleAsrParams(UncheckedBaseModel): + """ + Google ASR configuration parameters. 
+ """ + + project_id: str = pydantic.Field() + """ + Google Cloud project ID + """ + + location: str = pydantic.Field() + """ + Google Cloud region for the speech service + """ + + adc_credentials_string: str = pydantic.Field() + """ + Google Cloud service account credentials JSON string + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Recognition model to use + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_audio_config.py b/src/agora_agent/types/google_tts_audio_config.py new file mode 100644 index 0000000..9c2a405 --- /dev/null +++ b/src/agora_agent/types/google_tts_audio_config.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsAudioConfig(UncheckedBaseModel): + """ + Google audio output configuration. 
+ """ + + speaking_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_params.py b/src/agora_agent/types/google_tts_params.py index dc00322..4a9ee38 100644 --- a/src/agora_agent/types/google_tts_params.py +++ b/src/agora_agent/types/google_tts_params.py @@ -3,8 +3,12 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel +from .google_tts_audio_config import GoogleTtsAudioConfig +from .google_tts_voice_selection_params import GoogleTtsVoiceSelectionParams class GoogleTtsParams(UncheckedBaseModel): @@ -12,25 +16,17 @@ class GoogleTtsParams(UncheckedBaseModel): Google TTS configuration parameters. 
""" - key: str = pydantic.Field() + credentials: str = pydantic.Field() """ - Google Cloud API key + Google Cloud service account credentials JSON string """ - voice_name: str = pydantic.Field() - """ - Google voice name - """ - - language_code: typing.Optional[str] = pydantic.Field(default=None) - """ - Language code (e.g., "en-US") - """ - - sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) - """ - Sample rate in Hz (default depends on selected voice) - """ + voice_selection_params: typing_extensions.Annotated[ + GoogleTtsVoiceSelectionParams, FieldMetadata(alias="VoiceSelectionParams") + ] + audio_config: typing_extensions.Annotated[ + typing.Optional[GoogleTtsAudioConfig], FieldMetadata(alias="AudioConfig") + ] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/google_tts_voice_selection_params.py b/src/agora_agent/types/google_tts_voice_selection_params.py new file mode 100644 index 0000000..ee75953 --- /dev/null +++ b/src/agora_agent/types/google_tts_voice_selection_params.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsVoiceSelectionParams(UncheckedBaseModel): + """ + Google voice selection parameters. 
+ """ + + name: str = pydantic.Field() + """ + Google voice name + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/hume_ai_tts_params.py b/src/agora_agent/types/hume_ai_tts_params.py index 08cb12b..00c9f54 100644 --- a/src/agora_agent/types/hume_ai_tts_params.py +++ b/src/agora_agent/types/hume_ai_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .hume_ai_tts_params_provider import HumeAiTtsParamsProvider class HumeAiTtsParams(UncheckedBaseModel): @@ -17,9 +18,34 @@ class HumeAiTtsParams(UncheckedBaseModel): Hume AI API key """ + voice_id: str = pydantic.Field() + """ + Hume AI voice ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Base URL for the Hume AI API + """ + + provider: HumeAiTtsParamsProvider = pydantic.Field() + """ + Voice provider type + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Playback speed of the generated speech + """ + + trailing_silence: typing.Optional[float] = pydantic.Field(default=None) + """ + Duration of silence in seconds to add at the end of each utterance + """ + config_id: typing.Optional[str] = pydantic.Field(default=None) """ - Hume AI configuration ID + Hume AI configuration ID. Deprecated; use voice_id for the documented TTS shape. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/hume_ai_tts_params_provider.py b/src/agora_agent/types/hume_ai_tts_params_provider.py new file mode 100644 index 0000000..cf07e73 --- /dev/null +++ b/src/agora_agent/types/hume_ai_tts_params_provider.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +HumeAiTtsParamsProvider = typing.Union[typing.Literal["HUME_AI", "CUSTOM_VOICE"], typing.Any] diff --git a/src/agora_agent/types/llm.py b/src/agora_agent/types/llm.py new file mode 100644 index 0000000..2b0283d --- /dev/null +++ b/src/agora_agent/types/llm.py @@ -0,0 +1,120 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .llm_params import LlmParams +from .llm_style import LlmStyle + + +class Llm(UncheckedBaseModel): + """ + Large language model (LLM) configuration. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM callback address. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM verification API key. + """ + + access_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS access key ID. Used by Amazon Bedrock when api_key is not provided. + """ + + secret_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS secret access key. Used by Amazon Bedrock when api_key is not provided. + """ + + region: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS region. Used by Amazon Bedrock. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Top-level model identifier. Used by Amazon Bedrock. + """ + + system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + A set of predefined information used as input to the LLM. + """ + + params: typing.Optional[LlmParams] = None + max_history: typing.Optional[int] = pydantic.Field(default=None) + """ + The number of conversation history messages cached in the custom LLM. + """ + + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM input modalities. 
+ """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Prompt for agent activation failure. + """ + + vendor: typing.Optional[str] = pydantic.Field(default=None) + """ + LLM provider identifier. + """ + + style: typing.Optional[LlmStyle] = pydantic.Field(default=None) + """ + The request style for chat completion. + """ + + ignore_empty: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to handle empty Gemini responses. + """ + + greeting_configs: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) + """ + Agent greeting broadcast configuration. + """ + + template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Template parameter configuration. + """ + + mcp_servers: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + MCP server configuration. + """ + + headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Custom headers to include in requests to the LLM. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_params.py b/src/agora_agent/types/llm_params.py new file mode 100644 index 0000000..f6df01f --- /dev/null +++ b/src/agora_agent/types/llm_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class LlmParams(UncheckedBaseModel): + """ + Additional LLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM model identifier. + """ + + max_tokens: typing.Optional[int] = pydantic.Field(default=None) + """ + Maximum tokens in the response. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_style.py b/src/agora_agent/types/llm_style.py new file mode 100644 index 0000000..8319ca1 --- /dev/null +++ b/src/agora_agent/types/llm_style.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +LlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify", "bedrock"], typing.Any] diff --git a/src/agora_agent/types/microsoft_asr.py b/src/agora_agent/types/microsoft_asr.py new file mode 100644 index 0000000..f602e09 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .microsoft_asr_params import MicrosoftAsrParams + + +class MicrosoftAsr(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_asr_params.py b/src/agora_agent/types/microsoft_asr_params.py new file mode 100644 index 0000000..bea79e4 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr_params.py @@ -0,0 +1,42 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MicrosoftAsrParams(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration parameters. + """ + + key: str = pydantic.Field() + """ + Microsoft Azure API key + """ + + region: str = pydantic.Field() + """ + Azure region + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + phrase_list: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + Words or phrases to improve recognition accuracy + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_tts_params.py b/src/agora_agent/types/microsoft_tts_params.py index 3c9e80c..12f441e 100644 --- a/src/agora_agent/types/microsoft_tts_params.py +++ b/src/agora_agent/types/microsoft_tts_params.py @@ -32,6 +32,16 @@ class MicrosoftTtsParams(UncheckedBaseModel): Audio sampling rate in Hz """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. Values between 0.5 and 2.0. 
+ """ + + volume: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio volume. Values between 0.0 and 100.0. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/mllm.py b/src/agora_agent/types/mllm.py new file mode 100644 index 0000000..3bcdb95 --- /dev/null +++ b/src/agora_agent/types/mllm.py @@ -0,0 +1,88 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_params import MllmParams +from .mllm_turn_detection import MllmTurnDetection +from .mllm_vendor import MllmVendor + + +class Mllm(UncheckedBaseModel): + """ + Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. + """ + + enable: typing.Optional[bool] = pydantic.Field(default=None) + """ + Enable Multimodal Large Language Model. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM WebSocket URL for real-time communication. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The API key used for MLLM authentication. + """ + + adc_credentials_string: typing.Optional[str] = pydantic.Field(default=None) + """ + Base64-encoded Google Cloud Application Default Credentials. Used by Vertex AI. + """ + + project_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud project ID. Used by Vertex AI. + """ + + location: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud location or region. Used by Vertex AI. + """ + + messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + Array of conversation items used for short-term memory management. 
+ """ + + params: typing.Optional[MllmParams] = None + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM input modalities. + """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting message. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent failure message. + """ + + vendor: typing.Optional[MllmVendor] = pydantic.Field(default=None) + """ + MLLM provider. + """ + + turn_detection: typing.Optional[MllmTurnDetection] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_http_options.py b/src/agora_agent/types/mllm_http_options.py new file mode 100644 index 0000000..19baebb --- /dev/null +++ b/src/agora_agent/types/mllm_http_options.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmHttpOptions(UncheckedBaseModel): + """ + HTTP request options for the MLLM provider. + """ + + api_version: typing.Optional[str] = pydantic.Field(default=None) + """ + API version to use. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_input_audio_transcription.py b/src/agora_agent/types/mllm_input_audio_transcription.py new file mode 100644 index 0000000..6bb3d9d --- /dev/null +++ b/src/agora_agent/types/mllm_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmInputAudioTranscription(UncheckedBaseModel): + """ + Configuration for audio input transcription. + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language of the input audio. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Model to use for transcription. + """ + + prompt: typing.Optional[str] = pydantic.Field(default=None) + """ + Text to guide the transcription model. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_params.py b/src/agora_agent/types/mllm_params.py new file mode 100644 index 0000000..5437b69 --- /dev/null +++ b/src/agora_agent/types/mllm_params.py @@ -0,0 +1,71 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_http_options import MllmHttpOptions +from .mllm_input_audio_transcription import MllmInputAudioTranscription + + +class MllmParams(UncheckedBaseModel): + """ + Additional MLLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM model identifier. + """ + + voice: typing.Optional[str] = pydantic.Field(default=None) + """ + Voice identifier for audio output. + """ + + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + System instructions that define the agent behavior or tone. + """ + + input_audio_transcription: typing.Optional[MllmInputAudioTranscription] = None + affective_dialog: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to enable Gemini affective dialog. + """ + + proactive_audio: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether Gemini may choose not to respond when no reply is needed. + """ + + transcribe_agent: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the agent speech in real time. + """ + + transcribe_user: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the user speech in real time. + """ + + http_options: typing.Optional[MllmHttpOptions] = None + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for xAI Grok speech recognition and synthesis. + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sample rate in Hz. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection.py b/src/agora_agent/types/mllm_turn_detection.py new file mode 100644 index 0000000..2cd3503 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection.py @@ -0,0 +1,35 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_agora_vad_config import MllmTurnDetectionAgoraVadConfig +from .mllm_turn_detection_mode import MllmTurnDetectionMode +from .mllm_turn_detection_semantic_vad_config import MllmTurnDetectionSemanticVadConfig +from .mllm_turn_detection_server_vad_config import MllmTurnDetectionServerVadConfig + + +class MllmTurnDetection(UncheckedBaseModel): + """ + Turn detection configuration for the MLLM module. + """ + + mode: typing.Optional[MllmTurnDetectionMode] = pydantic.Field(default=None) + """ + Turn detection mode for MLLM. 
+ """ + + agora_vad_config: typing.Optional[MllmTurnDetectionAgoraVadConfig] = None + server_vad_config: typing.Optional[MllmTurnDetectionServerVadConfig] = None + semantic_vad_config: typing.Optional[MllmTurnDetectionSemanticVadConfig] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py new file mode 100644 index 0000000..4168ef3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py @@ -0,0 +1,23 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): + interrupt_duration_ms: typing.Optional[int] = None + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_mode.py b/src/agora_agent/types/mllm_turn_detection_mode.py new file mode 100644 index 0000000..f6cd693 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_mode.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionMode = typing.Union[typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py new file mode 100644 index 0000000..aeaf440 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py @@ -0,0 +1,21 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_semantic_vad_config_eagerness import MllmTurnDetectionSemanticVadConfigEagerness + + +class MllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): + eagerness: typing.Optional[MllmTurnDetectionSemanticVadConfigEagerness] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py new file mode 100644 index 0000000..dbf9b4d --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionSemanticVadConfigEagerness = typing.Union[typing.Literal["auto", "low", "medium", "high"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_server_vad_config.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py new file mode 100644 index 0000000..b2976b3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, +) +from .mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, +) + + +class MllmTurnDetectionServerVadConfig(UncheckedBaseModel): + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + idle_timeout_ms: typing.Optional[int] = None + start_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity] = None + end_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py similarity index 61% rename from 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py index e92d3f1..b9b3377 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ typing.Literal["END_SENSITIVITY_HIGH", "END_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py similarity index 61% rename from src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py index 25860c1..90ccf51 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ typing.Literal["START_SENSITIVITY_HIGH", "START_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/types/mllm_vendor.py b/src/agora_agent/types/mllm_vendor.py new file mode 100644 index 0000000..61c4d1a --- /dev/null +++ 
b/src/agora_agent/types/mllm_vendor.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +MllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/types/murf_tts_params.py b/src/agora_agent/types/murf_tts_params.py index 5107f62..78f78d8 100644 --- a/src/agora_agent/types/murf_tts_params.py +++ b/src/agora_agent/types/murf_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,19 +14,46 @@ class MurfTtsParams(UncheckedBaseModel): Murf TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Murf API key """ - voice_id: str = pydantic.Field() + base_url: typing.Optional[str] = pydantic.Field(default=None) """ - Voice ID (e.g., Ariana, Natalie, Ken) + WebSocket endpoint for streaming TTS output """ - style: typing.Optional[str] = pydantic.Field(default=None) + voice_id: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="voiceId")] = pydantic.Field( + default=None + ) """ - Voice style (e.g., Angry, Sad, Conversational, Newscast) + Voice ID (e.g., Matthew) + """ + + locale: typing.Optional[str] = pydantic.Field(default=None) + """ + Locale for the selected voice + """ + + rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech rate adjustment + """ + + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + TTS model to use + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/open_ai_asr.py b/src/agora_agent/types/open_ai_asr.py new file mode 100644 index 
0000000..eec2aab --- /dev/null +++ b/src/agora_agent/types/open_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .open_ai_asr_params import OpenAiAsrParams + + +class OpenAiAsr(UncheckedBaseModel): + """ + OpenAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_asr_params.py b/src/agora_agent/types/open_ai_asr_params.py new file mode 100644 index 0000000..a5fadc8 --- /dev/null +++ b/src/agora_agent/types/open_ai_asr_params.py @@ -0,0 +1,30 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .open_ai_input_audio_transcription import OpenAiInputAudioTranscription + + +class OpenAiAsrParams(UncheckedBaseModel): + """ + OpenAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + OpenAI API key + """ + + input_audio_transcription: OpenAiInputAudioTranscription + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_input_audio_transcription.py b/src/agora_agent/types/open_ai_input_audio_transcription.py new file mode 100644 index 0000000..9db45b1 --- /dev/null +++ b/src/agora_agent/types/open_ai_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class OpenAiInputAudioTranscription(UncheckedBaseModel): + """ + OpenAI audio transcription configuration. + """ + + model: str = pydantic.Field() + """ + OpenAI ASR model to use for transcription + """ + + prompt: str = pydantic.Field() + """ + Prompt that guides the transcription process + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 3839646..c8f6e51 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,12 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key. Optional for Agora-managed OpenAI TTS usage. + OpenAI API key. Optional for preset-backed OpenAI TTS usage. 
+ """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Endpoint URL for the OpenAI TTS service. """ voice: str = pydantic.Field() @@ -27,6 +32,16 @@ class OpenAiTtsParams(UncheckedBaseModel): Model name (e.g., "tts-1", "tts-1-hd") """ + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + Custom instructions for voice style, accent, pace, and tone. + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/rime_tts_params.py b/src/agora_agent/types/rime_tts_params.py index 6d18375..ade1c5b 100644 --- a/src/agora_agent/types/rime_tts_params.py +++ b/src/agora_agent/types/rime_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,7 +14,7 @@ class RimeTtsParams(UncheckedBaseModel): Rime TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Rime API key """ @@ -22,9 +24,14 @@ class RimeTtsParams(UncheckedBaseModel): Rime speaker ID """ - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: typing_extensions.Annotated[str, FieldMetadata(alias="modelId")] = pydantic.Field() """ - Model ID (optional) + Rime TTS model ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Rime streaming API """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/sarvam_asr.py b/src/agora_agent/types/sarvam_asr.py new file mode 100644 index 0000000..ec95847 --- /dev/null +++ b/src/agora_agent/types/sarvam_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .sarvam_asr_params import SarvamAsrParams + + +class SarvamAsr(UncheckedBaseModel): + """ + Sarvam ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_asr_params.py b/src/agora_agent/types/sarvam_asr_params.py new file mode 100644 index 0000000..f29769d --- /dev/null +++ b/src/agora_agent/types/sarvam_asr_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SarvamAsrParams(UncheckedBaseModel): + """ + Sarvam ASR configuration parameters. + """ + + api_key: str = pydantic.Field() + """ + Sarvam API key + """ + + language: str = pydantic.Field() + """ + Language code for transcription. Set to unknown for automatic language detection. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_tts_params.py b/src/agora_agent/types/sarvam_tts_params.py index 93457a4..855299f 100644 --- a/src/agora_agent/types/sarvam_tts_params.py +++ b/src/agora_agent/types/sarvam_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .sarvam_tts_params_target_language_code import SarvamTtsParamsTargetLanguageCode class SarvamTtsParams(UncheckedBaseModel): @@ -12,7 +13,7 @@ class SarvamTtsParams(UncheckedBaseModel): Sarvam TTS configuration parameters. """ - key: str = pydantic.Field() + api_subscription_key: str = pydantic.Field() """ Sarvam API subscription key """ @@ -22,11 +23,31 @@ class SarvamTtsParams(UncheckedBaseModel): Voice ID (e.g., anushka, abhilash, karun, hitesh, manisha, vidya, arya) """ - target_language_code: str = pydantic.Field() + target_language_code: SarvamTtsParamsTargetLanguageCode = pydantic.Field() """ Target language code (e.g., en-IN) """ + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment for the voice + """ + + pace: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + loudness: typing.Optional[float] = pydantic.Field(default=None) + """ + Volume level of the speech + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/sarvam_tts_params_target_language_code.py b/src/agora_agent/types/sarvam_tts_params_target_language_code.py 
new file mode 100644 index 0000000..b1722ec --- /dev/null +++ b/src/agora_agent/types/sarvam_tts_params_target_language_code.py @@ -0,0 +1,8 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +SarvamTtsParamsTargetLanguageCode = typing.Union[ + typing.Literal["en-IN", "hi-IN", "bn-IN", "ta-IN", "te-IN", "kn-IN", "ml-IN", "mr-IN", "gu-IN", "pa-IN", "or-IN"], + typing.Any, +] diff --git a/src/agora_agent/types/speechmatics_asr.py b/src/agora_agent/types/speechmatics_asr.py new file mode 100644 index 0000000..644db25 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class SpeechmaticsAsr(UncheckedBaseModel): + """ + Speechmatics ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/speechmatics_asr_params.py b/src/agora_agent/types/speechmatics_asr_params.py new file mode 100644 index 0000000..4709d22 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SpeechmaticsAsrParams(UncheckedBaseModel): + """ + Speechmatics ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Speechmatics API key + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Speechmatics streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py deleted file mode 100644 index fa73fc0..0000000 --- a/tests/custom/test_avatar_token.py +++ /dev/null @@ -1,12 +0,0 @@ -from agora_agent.agentkit import generate_convo_ai_token - - -def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): - token = generate_convo_ai_token( - app_id="0" * 32, - app_certificate="1" * 32, - channel_name="room", - uid=123, - ) - - assert token.startswith("007") diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py deleted file mode 100644 index faca9bf..0000000 --- a/tests/custom/test_llm_vendors.py +++ /dev/null @@ -1,60 +0,0 @@ -from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM - - -def test_groq_serializes_as_openai_compatible() -> None: - config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() - - assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" - assert config["api_key"] == "groq-key" - assert config["style"] == "openai" - assert config["params"]["model"] == "llama-3.3-70b-versatile" - - -def test_custom_llm_marks_request_as_custom() -> None: - config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() - - assert config["url"] == "https://llm.example.com/chat" - assert config["api_key"] == "key" - assert config["vendor"] == "custom" - assert config["style"] == "openai" - 
- -def test_vertex_ai_llm_includes_project_routing() -> None: - config = VertexAILLM( - api_key="vertex-token", - model="gemini-2.0-flash", - project_id="project", - location="us-central1", - ).to_config() - - assert config["api_key"] == "vertex-token" - assert config["style"] == "gemini" - assert config["params"]["model"] == "gemini-2.0-flash" - assert config["params"]["project_id"] == "project" - assert config["params"]["location"] == "us-central1" - - -def test_amazon_bedrock_serializes_as_anthropic_style() -> None: - config = AmazonBedrock( - api_key="bedrock-key", - url="https://bedrock.example.com/messages", - model="anthropic.claude-3-5-sonnet-20241022-v2:0", - ).to_config() - - assert config["api_key"] == "bedrock-key" - assert config["style"] == "anthropic" - assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" - - -def test_dify_serializes_conversation_fields() -> None: - config = Dify( - api_key="dify-key", - url="https://api.dify.ai/v1/chat-messages", - user="user-1", - conversation_id="conversation-1", - ).to_config() - - assert config["api_key"] == "dify-key" - assert config["style"] == "dify" - assert config["params"]["user"] == "user-1" - assert config["params"]["conversation_id"] == "conversation-1" diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py deleted file mode 100644 index 9b2f508..0000000 --- a/tests/custom/test_root_exports.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest - -import agora_agent -import agora_agent.agentkit as agentkit - - -def test_root_exports_match_agentkit_for_common_symbols() -> None: - for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): - assert getattr(agora_agent, name) is getattr(agentkit, name) - - -def test_root_exports_fern_client_symbols() -> None: - assert agora_agent.Agora is not None - assert agora_agent.Area is not None - assert agora_agent.AsyncAgora is not None - - -def 
test_unknown_root_export_raises_attribute_error() -> None: - with pytest.raises(AttributeError): - _ = agora_agent.NotARealExportName - - -def test_dir_includes_agentkit_vendor_exports() -> None: - assert "DeepgramSTT" in dir(agora_agent) - - -def test_all_includes_agentkit_vendor_exports() -> None: - assert "DeepgramSTT" in agora_agent.__all__ - assert "OpenAI" in agora_agent.__all__ From 33f922917fb4c44ae98962a19083399bb5f1cb69 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 07:03:56 +0000 Subject: [PATCH 19/26] [fern-replay] Applied customizations Patches applied (5): - patch-64703bda: test(agentkit): add custom tests for v1.5.0 AgentKit behavior - patch-7c2d9d99: feat(agentkit): align session options and token uid handling - patch-7465fada: fix(agentkit): resolve Python session typing issues - patch-fae1249a: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. - patch-44c21c14: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
Patches with unresolved conflicts (17): - patch-6e30398b: chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases - patch-9df782b4: feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 - patch-26706d73: feat(agentkit): add GenericAvatar and session-aware avatar validation - patch-9f491c63: feat(agentkit): update Agent builder and session lifecycle for v2.7 - patch-6c20f076: docs(agentkit): update v1.5.0 guides, reference, and changelog - patch-eaec58eb: refactor(agentkit): align deprecated vendor aliases with canonical names - patch-20245632: feat(agentkit): export type aliases and avatar token helpers - patch-972dd5bd: updated docs - patch-4323b470: rename python package to agora-agents - patch-d29165c4: make python compat package publishable - patch-fc9d93c3: Document agora-agents PyPI install name and migration notes - patch-87fc4488: Update docs to import from agora_agent package root - patch-923cf954: Prioritize app credentials and builder in Python docs Rewrite getting-started auth and quick-start for app credentials with the builder API. De-emphasize presets and align index, BYOK, and README with the recommended onboarding path. - patch-d475306b: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. - patch-c9355576: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. - patch-98ecb4d3: Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM vendor helpers. Introduce named LLM vendor classes with correct request serialization, export them from the package root, and add tests covering each provider's config shape. 
- patch-a5097b8d: Document new LLM vendors and tighten onboarding docs. Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM to vendor references, simplify README and index navigation, and align quick-start and terminology with Agora-managed model language. Run `fern-replay resolve` to apply these customizations. Patches absorbed by generator (3): - patch-b7f0c36c: feat(agentkit): release v2.0.0 updates - patch-4d32368c: Add compat-build CI job and harden dual-package PyPI publish Build and verify the compat wheel re-exports, gate publish on compat-build, simplify version checks with poetry version, wait for primary package on PyPI, and retry compat publish on failure. - patch-20109390: Fix PyPI publish auth and explicitly protect release workflow in Fern ignore. Use PYPI_API_TOKEN for primary and compat Poetry publishes, matching the v1.4.1 release flow, and list release.yml explicitly in .fernignore. The generator now produces these customizations natively. --- .fern/replay.lock | 12088 ++++++++++++++++++- src/agora_agent/agentkit/agent.py | 2 + src/agora_agent/agentkit/agent_session.py | 1 + src/agora_agent/agentkit/vendors/avatar.py | 43 + src/agora_agent/agentkit/vendors/mllm.py | 1 + tests/custom/test_agentkit_agent.py | 298 + tests/custom/test_agentkit_session.py | 383 + tests/custom/test_agentkit_vendors.py | 122 + tests/custom/test_avatar_token.py | 12 + tests/custom/test_llm_vendors.py | 60 + tests/custom/test_root_exports.py | 29 + 11 files changed, 13037 insertions(+), 2 deletions(-) create mode 100644 tests/custom/test_agentkit_agent.py create mode 100644 tests/custom/test_agentkit_session.py create mode 100644 tests/custom/test_agentkit_vendors.py create mode 100644 tests/custom/test_avatar_token.py create mode 100644 tests/custom/test_llm_vendors.py create mode 100644 tests/custom/test_root_exports.py diff --git a/.fern/replay.lock b/.fern/replay.lock index 536b6e4..2e0fcdc 100644 --- a/.fern/replay.lock +++ b/.fern/replay.lock @@ -6,5 +6,12089 @@ 
generations: timestamp: 2026-05-20T20:38:02.180Z cli_version: unknown generator_versions: {} -current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf -patches: [] + - commit_sha: 403a1a9ac18fd05139ac0140d1837d28f5805dcb + tree_hash: 594698fa5d5253002a6ee764b8a87926b169fb39 + timestamp: 2026-06-02T07:03:42.322Z + cli_version: unknown + generator_versions: + fernapi/fern-python-sdk: 4.37.0 +current_generation: 403a1a9ac18fd05139ac0140d1837d28f5805dcb +patches: + - id: patch-6e30398b + content_hash: sha256:e99898e508e2d6cb9f134cc33e0b73c1c8acb845f5887924e0e38031a6e089c0 + original_commit: 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 + original_message: "chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/vendors/__init__.py + patch_content: | + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 1942bce..5ceda66 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -1,13 +1,30 @@ + from .agent import ( + Agent, + + AgentConfig, + + AgentConfigUpdate, + + ConversationHistory, + + ConversationRole, + + ConversationSessionTurn, + + ConversationTurn, + + ConversationTurns, + StartAgentsRequestProperties, + + AvatarConfig, + + AvatarVendor, + GeofenceConfig, + + LlmConfig, + + LlmStyle, + + MllmConfig, + + MllmVendor, + RtcConfig, + + SttConfig, + + SttVendor, + + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + @@ -37,9 +54,14 @@ from .agent import ( + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + 
McpServersItem, + + SessionInfo, + + SessionListResponse, + + SessionSummary, + + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + @@ -57,8 +79,10 @@ from ..agent_management.types.agent_think_agent_management_request_on_speaking_a + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -112,6 +136,7 @@ from .vendors import ( + FishAudioTTS, + Gemini, + GeminiLive, + + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + @@ -132,14 +157,27 @@ from .vendors import ( + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + + XaiGrok, + + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + + "AgentConfig", + + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + + "LlmConfig", + + "LlmStyle", + + "SttConfig", + + "SttVendor", + + "TtsConfig", + + "MllmConfig", + + "MllmVendor", + + "AvatarConfig", + + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + @@ -147,6 +185,7 @@ __all__ = [ + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + @@ -181,6 +220,7 @@ __all__ = [ + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + @@ -197,6 +237,15 @@ __all__ = [ + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + + "SessionInfo", + + "SessionListResponse", + + "SessionSummary", + + "ConversationHistory", + + "ConversationTurn", + + "ConversationRole", + + "ConversationTurns", + + "ConversationSessionTurn", + 
+ "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + @@ -253,14 +302,19 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + + "is_generic_avatar", + + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 0320843..689eab1 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -11,9 +11,9 @@ from .base import ( + OpenAISampleRate, + SampleRate, + ) + -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar + +from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + @@ -82,8 +82,11 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + ] + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + 
FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + 
GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + XaiGrok, + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + 
"EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", 
+ "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + 
"SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + status: unresolved + - id: patch-9df782b4 + content_hash: sha256:84c08fe3239d2ecb0b0a3ddd33b0dce4e7b012125be797aa83ca12893363b565 + original_commit: 9df782b46d872599f103078e30c5ded2053f2517 + original_message: "feat(agentkit): update MLLM and LLM vendor wrappers for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:57:54 -0400 + Subject: [PATCH] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 + + Adds xAI Grok Realtime and Vertex AI MLLM wrappers, and aligns MLLM + config serialization with the generated core types. LLM vendors now + accept typed greeting_configs and serialize them through the generated + model shape, including interruptable. 
+ --- + src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- + src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ + 2 files changed, 113 insertions(+), 36 deletions(-) + + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 7465c9f..6f74b43 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,9 +1,14 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] + result.append(item) + return result + + + + +def _dump_optional_model(value: Any) -> Any: + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -74,7 +87,7 @@ class OpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: 
Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -150,7 +163,7 @@ class AzureOpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -216,7 +229,7 @@ class Anthropic(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + 
greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -287,7 +300,7 @@ class Gemini(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 5f6f940..cd6cd07 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -53,18 +52,97 @@ class OpenAIRealtime(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if 
self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + + + +class XaiGrokOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="xAI API key") + + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + + + +class XaiGrok(BaseMLLM): + + """xAI 
Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + + + def __init__(self, **kwargs: Any): + + self.options = XaiGrokOptions(**kwargs) + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = dict(self.options.params or {}) + + if self.options.voice is not None: + + params["voice"] = self.options.voice + + if self.options.language is not None: + + params["language"] = self.options.language + + if self.options.sample_rate is not None: + + params["sample_rate"] = self.options.sample_rate + + + + config: Dict[str, Any] = { + + "vendor": "xai", + + "api_key": self.options.api_key, + + "url": self.options.url, + + "params": params, + + } + + + + if self.options.greeting_message is not None: + + config["greeting_message"] = self.options.greeting_message + + if self.options.input_modalities is not None: + + config["input_modalities"] = self.options.input_modalities + + if self.options.output_modalities is not None: + + config["output_modalities"] = self.options.output_modalities + + if self.options.messages is not None: + + config["messages"] = self.options.messages + + if self.options.failure_message is not None: + + config["failure_message"] = self.options.failure_message + + if self.options.turn_detection is not None: + + config["turn_detection"] = self.options.turn_detection + + + + return config + + + + + +class XaiRealtimeOptions(XaiGrokOptions): + + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + + + def __init__(self, **data: Any): + + warnings.warn( + + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**data) + + + + + +class XaiRealtime(XaiGrok): + + """Deprecated: use :class:`XaiGrok` instead.""" + + + + def __init__(self, **kwargs: Any): + + warnings.warn( + + "XaiRealtime is deprecated; use XaiGrok instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**kwargs) + + + + + class VertexAIOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + - params: Dict[str, Any] = { + - "model": self.options.model, + - "project_id": self.options.project_id, + - "location": self.options.location, + - "adc_credentials_string": self.options.adc_credentials_string, + - } + - + + # additional_params spread first so that explicit fields always win, + + # matching the TypeScript SDK. 
+ + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + params["model"] = self.options.model + + params["project_id"] = self.options.project_id + + params["location"] = self.options.location + + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + - if self.options.additional_params is not None: + - params.update(self.options.additional_params) + + config: Dict[str, Any] = { + "vendor": "vertexai", + @@ -119,12 +193,8 @@ class VertexAI(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -179,12 
+247,8 @@ class GeminiLive(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(default="gpt-4o-mini", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + endpoint: str = Field(..., 
description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + max_tokens: Optional[int] = Field(default=None, gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + 
failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or "https://api.anthropic.com/v1/messages", + "api_key": self.options.api_key, + "params": params, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = 
self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + src/agora_agent/agentkit/vendors/mllm.py: 
| + import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, 
description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + + class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: 
str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: 
Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + - id: patch-26706d73 + content_hash: sha256:a9551e0b774b96e7734e9faa7d770611861cf443837428272ef75710447238da + original_commit: 26706d73ae15d860d57daf926837632c01be7f10 + original_message: "feat(agentkit): add GenericAvatar and 
session-aware avatar validation" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/vendors/avatar.py + patch_content: |+ + From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:59:22 -0400 + Subject: [PATCH] feat(agentkit): add GenericAvatar and session-aware avatar + validation + + Adds the GenericAvatar vendor wrapper and extends avatar validation + helpers for generic and RTC-backed avatars. Session-derived fields such + as agora_appid, agora_channel, and agora_token can now be validated + after AgentSession enrichment. + --- + src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- + src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ + 2 files changed, 76 insertions(+), 1 deletion(-) + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index 9e132a9..a04809c 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + return config.get("vendor") == "generic" + + + + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + ) + + + + + +def validate_avatar_config( + + config: typing.Dict[str, typing.Any], + + require_session_fields: bool = False, + +) -> None: + """Validates avatar configuration at runtime. 
+ + Parameters + @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + + if require_session_fields and not params.get("agora_token"): + + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + + elif is_generic_avatar(config): + + params = config.get("params", {}) + + if not params.get("api_key"): + + raise ValueError("Generic avatar requires api_key") + + if not params.get("api_base_url"): + + raise ValueError("Generic avatar requires api_base_url") + + if not params.get("avatar_id"): + + raise ValueError("Generic avatar requires avatar_id") + + if not params.get("agora_uid"): + + raise ValueError("Generic avatar requires agora_uid") + + if require_session_fields: + + if not params.get("agora_token"): + + raise ValueError("Generic avatar requires agora_token after session enrichment") + + if not params.get("agora_appid"): + + raise ValueError("Generic avatar requires agora_appid after session enrichment") + + if not params.get("agora_channel"): + + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index b83a356..00cad8f 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -132,6 +132,48 @@ class LiveAvatarAvatar(BaseAvatar): + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + +class GenericAvatarOptions(BaseModel): + + 
model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Generic avatar provider API key") + + api_base_url: str = Field(..., description="Avatar provider API base URL") + + avatar_id: str = Field(..., description="Avatar ID") + + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + +class GenericAvatar(BaseAvatar): + + def __init__(self, **kwargs: Any): + + self.options = GenericAvatarOptions(**kwargs) + + + + @property + + def required_sample_rate(self) -> int: + + return 0 + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = { + + "api_key": self.options.api_key, + + "api_base_url": self.options.api_base_url, + + "avatar_id": self.options.avatar_id, + + "agora_uid": self.options.agora_uid, + + } + + + + if self.options.agora_appid is not None: + + params["agora_appid"] = self.options.agora_appid + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + + if self.options.agora_channel is not None: + + params["agora_channel"] = self.options.agora_channel + + if self.options.additional_params is not None: + + params = {**self.options.additional_params, **params} + + + + enable = self.options.enable if self.options.enable is not None else True + + return {"enable": enable, "vendor": "generic", "params": params} + + + + + class AnamAvatarOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/avatar_types.py: | + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. + """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. 
" + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. 
+ """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + AKOOL_SAMPLE_RATE = 16000 + + + class HeyGenAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="HeyGen API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, 
description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific 
parameters") + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + pass + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = 
Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + status: unresolved + - id: patch-9f491c63 + content_hash: sha256:d9811b2c5927be74f2125444dcf36642b88ad7be422019688cb0228093dce1d0 + original_commit: 9f491c63a964c13c67ba4af3708379e1b75a92d8 + original_message: "feat(agentkit): update Agent builder and session lifecycle for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + patch_content: |+ + From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 21:00:58 -0400 + Subject: [PATCH] feat(agentkit): update Agent builder and session lifecycle + for v2.7 + + Aligns Agent and AgentSession with the generated v2.7 request shape. + MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars + are rejected when MLLM is configured. AgentSession now enriches generic + and RTC avatars with session context, auto-generates avatar tokens, + validates TTS sample rates from vendor-specific fields, and adds + paginated get_turns/get_all_turns helpers with fail-fast pagination + guards. 
+ --- + src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- + src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- + 2 files changed, 360 insertions(+), 35 deletions(-) + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 70a1bdd..86a958e 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -8,6 +8,24 @@ if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + +from ..agents.types.get_agents_response import GetAgentsResponse + +from ..agents.types.list_agents_response import ListAgentsResponse + +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + +from 
..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger import + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + +LlmConfig = StartAgentsRequestPropertiesLlm + +LlmStyle = StartAgentsRequestPropertiesLlmStyle + +SttConfig = StartAgentsRequestPropertiesAsr + +SttVendor = StartAgentsRequestPropertiesAsrVendor + +TtsConfig = Tts + +MllmConfig = StartAgentsRequestPropertiesMllm + +MllmVendor = 
StartAgentsRequestPropertiesMllmVendor + +AvatarConfig = StartAgentsRequestPropertiesAvatar + +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + @@ -93,6 +122,18 @@ InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + +AgentConfig = StartAgentsRequestProperties + +AgentConfigUpdate = UpdateAgentsRequestProperties + +SessionInfo = GetAgentsResponse + +SessionListResponse = ListAgentsResponse + +SessionSummary = ListAgentsResponseDataListItem + +ConversationHistory = GetHistoryAgentsResponse + +ConversationTurn = GetHistoryAgentsResponseContentsItem + +ConversationRole = GetHistoryAgentsResponseContentsItemRole + +ConversationTurns = GetTurnsAgentsResponse + +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + +SpeakPriority = SpeakAgentsRequestPriority + +Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + @@ -116,6 +157,7 @@ FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + @@ -183,9 +225,20 @@ class Agent: + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + + sample_rate = vendor.sample_rate + + if ( + + self._avatar_required_sample_rate not in (None, 0) + + and sample_rate is not None + 
+ and sample_rate != self._avatar_required_sample_rate + + ): + + raise ValueError( + + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + + f"but TTS is configured with {sample_rate} Hz. " + + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + - new_agent._tts_sample_rate = vendor.sample_rate + + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + @@ -194,6 +247,9 @@ class Agent: + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + + # Note: avatars are not supported with MLLM. The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` so callers can still + + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + @@ -202,7 +258,10 @@ class Agent: + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + + advanced_features_model = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_mllm": None}, + + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + @@ -214,6 +273,10 @@ class Agent: + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + + # Note: avatars are not supported with MLLM. 
The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + + # enabled) so callers may still combine the two for testing or for the + + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + @@ -282,7 +345,10 @@ class Agent: + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + + new_agent._advanced_features = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_tools": enabled}, + + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + @@ -294,6 +360,23 @@ class Agent: + new_agent._parameters = parameters + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + + new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. 
+ + @@ -342,6 +425,33 @@ class Agent: + new_agent._filler_words = filler_words + return new_agent + + + @staticmethod + + def _field_value(value: typing.Any, field: str) -> typing.Any: + + if value is None: + + return None + + if isinstance(value, dict): + + return value.get(field) + + return getattr(value, field, None) + + + + @staticmethod + + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + + if hasattr(value, "model_copy"): + + return value.model_copy(update=update) + + if hasattr(value, "copy"): + + return value.copy(update=update) + + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + + data_channel = self._field_value(self._parameters, "data_channel") + + if not enable_rtm or data_channel is not None: + + return self._parameters + + if self._parameters is None: + + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + + if isinstance(self._parameters, dict): + + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + + @property + def name(self) -> typing.Optional[str]: + return self._name + @@ -354,6 +464,10 @@ class Agent: + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + + @property + + def tts_sample_rate(self) -> typing.Optional[int]: + + return self._tts_sample_rate + + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + @@ -536,6 +650,20 @@ class Agent: + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + + # Validate the MLLM + enabled-avatar combination BEFORE generating the + + # RTC token so callers 
get a clear, actionable error first (matches the + + # TypeScript and Go SDKs' fail-fast contract). + + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + + avatar_enabled = ( + + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + + ) + + if is_mllm_mode and avatar_enabled: + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + @@ -553,9 +681,6 @@ class Agent: + **token_kwargs, + ) + + - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + - is_mllm_mode = bool(mllm_flag or self._mllm is not None) + - + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + @@ -579,11 +704,12 @@ class Agent: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + - if self._parameters is not None: + - if isinstance(self._parameters, dict): + - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + + parameters = self._resolved_parameters() + + if parameters is not None: + + if isinstance(parameters, dict): + + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + - base_kwargs["parameters"] = self._parameters + + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + @@ -596,12 +722,10 @@ class Agent: + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + - if self._greeting: + + if self._greeting is not None: + 
mllm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + - if self._max_history is not None: + - mllm_config.setdefault("max_history", self._max_history) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + @@ -617,14 +741,14 @@ class Agent: + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + - if self._instructions: + + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + - if self._greeting: + - llm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + - llm_config.setdefault("failure_message", self._failure_message) + + if self._greeting is not None: + + llm_config["greeting_message"] = self._greeting + + if self._failure_message is not None: + + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + - llm_config.setdefault("max_history", self._max_history) + + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index 2408659..e41a399 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_request_on_thinking_a + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent 
import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -182,17 +185,29 @@ class _AgentSessionBase: + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + + if self._is_mllm_mode(): + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + + sample_rate = self._agent.tts_sample_rate + + if sample_rate is None and isinstance(tts_params, dict): + + sample_rate = ( + + tts_params.get("sample_rate") + + or tts_params.get("sample_rate_hertz") + + or tts_params.get("samplingRate") + + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + @@ -211,6 +226,50 @@ class _AgentSessionBase: + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + + avatar = properties.get("avatar") + + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + + return + + + + params = avatar.get("params") + + if not isinstance(params, dict): + + params = {} + + avatar["params"] = params + + + + if is_generic_avatar(avatar): + + if not params.get("agora_appid"): + + params["agora_appid"] = self._app_id + + if not params.get("agora_channel"): + + params["agora_channel"] = self._channel + + + + if not is_rtc_avatar(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_token"): + + if not self._app_certificate: + + raise ValueError( + + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + + ) + + token_kwargs: typing.Dict[str, typing.Any] = {} + + if self._expires_in is not None: + + token_kwargs["token_expire"] = self._expires_in + + params["agora_token"] = generate_convo_ai_token( + + app_id=self._app_id, + + app_certificate=self._app_certificate, + + channel_name=self._channel, + + account=str(params["agora_uid"]), + + **token_kwargs, + + ) + + + + if str(params.get("agora_uid")) == self._agent_uid: + + self._warn( + + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ + ) + + + + validate_avatar_config(avatar, require_session_fields=True) + + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + @@ -238,12 +297,17 @@ class _AgentSessionBase: + **token_opts, + ) + properties = self._dump_model(base_properties) + + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + - mllm = dict(self._agent.mllm) + - if self._agent.greeting: + + mllm = self._dump_model(self._agent.mllm) + + if not isinstance(mllm, dict): + + mllm = {} + + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + + if self._agent.failure_message is not None: + + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + @@ -251,20 +315,41 @@ class _AgentSessionBase: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + - if self._agent.instructions: + + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + - if self._agent.greeting: + - llm.setdefault("greeting_message", self._agent.greeting) + - if self._agent.failure_message: + - llm.setdefault("failure_message", self._agent.failure_message) + + if self._agent.greeting is not None: + + llm["greeting_message"] = self._agent.greeting + + if self._agent.failure_message is not None: + + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + - llm.setdefault("max_history", self._agent.max_history) + + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + + @staticmethod + + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + + if pagination is None: + + return None + + if 
isinstance(pagination, dict): + + return pagination.get(field) + + return getattr(pagination, field, None) + + + + @staticmethod + + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + + return list(turns or []) + + + + @classmethod + + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + + data = cls._dump_model(first_response) + + if not isinstance(data, dict): + + data = {} + + data["turns"] = turns + + return GetTurnsAgentsResponse(**data) + + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + @@ -484,7 +569,12 @@ class AgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -547,15 +637,68 @@ class AgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - def get_turns(self) -> typing.Any: + + def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." + + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + @@ -734,7 +877,12 @@ class AsyncAgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -797,11 +945,64 @@ class AsyncAgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - async def get_turns(self) -> typing.Any: + + async def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = await self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = await self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from 
..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from 
..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, 
BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and 
class SessionParamsInput(typing_extensions.TypedDict, total=False):
    """Plain-dict form of the session ``parameters`` accepted by :class:`Agent`.

    Declared with ``total=False``, so every key is optional.
    ``Agent.to_properties`` expands a dict of this shape with
    ``StartAgentsRequestPropertiesParameters(**parameters)``.
    """

    # Same type as the ``SilenceConfig`` alias.
    silence_config: StartAgentsRequestPropertiesParametersSilenceConfig
    # Same type as the ``FarewellConfig`` alias.
    farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig
    # When left unset and ``advanced_features.enable_rtm`` is True,
    # ``Agent._resolved_parameters`` fills this in with "rtm".
    data_channel: StartAgentsRequestPropertiesParametersDataChannel
    enable_metrics: bool
    enable_error_message: bool
    # Also settable through ``Agent.with_audio_scenario``.
    audio_scenario: ParametersAudioScenario
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and 
sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + 
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # A single preset name or a sequence of names.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callback that receives warning text; the session constructor falls back
    # to ``warnings.warn`` when none is supplied.
    warn: typing.Callable[[str], None]
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history 
is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + status: unresolved + - id: patch-eaec58eb + content_hash: sha256:8390ced175326080fc76021a97d315e71229bbc9ad70eef35a63eb9968df7830 + original_commit: eaec58eb2edfe03b1311a32dd137a867edf5d096 + original_message: "refactor(agentkit): align deprecated vendor aliases with canonical names" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/__init__.py + - src/agora_agent/agentkit/vendors/avatar.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 689eab1..8e2042e 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -13,7 +13,7 @@ from .base import ( + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + @@ -83,7 +83,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index 00cad8f..50bdd08 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -5,19 +5,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + -HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + -class HeyGenAvatarOptions(BaseModel): + +class 
LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + - api_key: str = Field(..., description="HeyGen API key") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + @@ -31,20 +31,14 @@ class HeyGenAvatarOptions(BaseModel): + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + -class HeyGenAvatar(BaseAvatar): + - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - warnings.warn( + - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - self.options = HeyGenAvatarOptions(**kwargs) + + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return HEYGEN_SAMPLE_RATE + + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + @@ -65,71 +59,79 @@ class HeyGenAvatar(BaseAvatar): + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "heygen", "params": params} + + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + -class AkoolAvatarOptions(BaseModel): + - model_config = ConfigDict(extra="forbid") + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + - api_key: str = Field(..., description="Akool API key") + - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + -class AkoolAvatar(BaseAvatar): + +class HeyGenAvatar(BaseAvatar): + + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + + def __init__(self, **kwargs: Any): + - self.options = AkoolAvatarOptions(**kwargs) + + warnings.warn( + + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return AKOOL_SAMPLE_RATE + + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + + "quality": self.options.quality, + + "agora_uid": self.options.agora_uid, + } + + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + + if self.options.disable_idle_timeout is not None: + + params["disable_idle_timeout"] = self.options.disable_idle_timeout + + if self.options.activity_idle_timeout is not None: + + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "akool", "params": params} + + return {"enable": enable, "vendor": "heygen", "params": params} + + + -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + - pass + +class AkoolAvatarOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Akool API key") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + -class LiveAvatarAvatar(BaseAvatar): + +class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - 
self.options = LiveAvatarAvatarOptions(**kwargs) + + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return LIVEAVATAR_SAMPLE_RATE + + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + - "quality": self.options.quality, + - "agora_uid": self.options.agora_uid, + } + + - if self.options.agora_token is not None: + - params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + - if self.options.disable_idle_timeout is not None: + - params["disable_idle_timeout"] = self.options.disable_idle_timeout + - if self.options.activity_idle_timeout is not None: + - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "liveavatar", "params": params} + + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + - persona_id: Optional[str] = Field(default=None, description="Persona ID") + + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index cd6cd07..b58f040 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,4 +1,3 @@ + -import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -119,30 +118,6 @@ class XaiGrok(BaseMLLM): + return config + + + -class XaiRealtimeOptions(XaiGrokOptions): + - """Deprecated: use :class:`XaiGrokOptions` instead.""" + - + - def __init__(self, **data: Any): + - warnings.warn( + - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**data) + - + - + -class XaiRealtime(XaiGrok): + - """Deprecated: use :class:`XaiGrok` instead.""" + - + - def __init__(self, **kwargs: Any): + - warnings.warn( + - "XaiRealtime is deprecated; use XaiGrok instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**kwargs) + - + - + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + theirs_snapshot: + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + 
MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + LIVEAVATAR_SAMPLE_RATE = 24000 + HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + class LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] 
= Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = 
self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = 
{**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + src/agora_agent/agentkit/vendors/mllm.py: | + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: 
Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). 
Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class 
VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + 
- id: patch-20245632 + content_hash: sha256:a22e4a3b114ba8105c8129ccd6222570dc1f231daf9ac6037a00bcd4e11c425b + original_commit: 20245632afd066efe5a453665b29c5ba0e13e4f8 + original_message: "feat(agentkit): export type aliases and avatar token helpers" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/constants.py + patch_content: |+ + From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 15:17:27 -0400 + Subject: [PATCH] feat(agentkit): export type aliases and avatar token helpers + + --- + src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- + src/agora_agent/agentkit/agent.py | 22 +++++++++- + src/agora_agent/agentkit/agent_session.py | 8 +++- + src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- + src/agora_agent/agentkit/constants.py | 10 +++++ + 5 files changed, 90 insertions(+), 22 deletions(-) + + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 5ceda66..e9ab221 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -2,6 +2,7 @@ from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + @@ -62,23 +63,23 @@ from .agent import ( + SessionListResponse, + SessionSummary, + SpeakPriority, + + ThinkOnListeningAction, + + ThinkOnSpeakingAction, + + ThinkOnThinkingAction, + + ThinkResponse, + ) + -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + -from ..agent_management.types.agent_think_agent_management_response import ( + - AgentThinkAgentManagementResponse as AgentThinkResponse, + -) + -from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + +# Deprecated think type aliases (prefer ThinkOn* names). + +from .agent import ( + + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + + ThinkResponse as AgentThinkResponse, + ) + +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + @@ -94,6 +95,13 @@ from .constants import ( + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + + ThinkOnListeningActionIgnore, + + ThinkOnListeningActionInject, + + ThinkOnListeningActionInterrupt, + + ThinkOnSpeakingActionIgnore, + + ThinkOnSpeakingActionInterrupt, + + ThinkOnThinkingActionIgnore, + + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + @@ -158,7 +166,6 @@ from .vendors import ( + SpeechmaticsSTT, + VertexAI, + XaiGrok, + - XaiRealtime, + LiveAvatarAvatar, + ) + + @@ -172,6 +179,7 @@ __all__ = [ + "LlmConfig", + "LlmStyle", + "SttConfig", + + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + @@ -230,6 +238,13 @@ __all__ = [ + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + + "ThinkOnListeningActionInject", + + 
"ThinkOnListeningActionInterrupt", + + "ThinkOnListeningActionIgnore", + + "ThinkOnThinkingActionInterrupt", + + "ThinkOnThinkingActionIgnore", + + "ThinkOnSpeakingActionInterrupt", + + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + @@ -246,10 +261,16 @@ __all__ = [ + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + + "ThinkResponse", + + "ThinkOnListeningAction", + + "ThinkOnThinkingAction", + + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + + "is_avatar_token_managed", + + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + @@ -303,7 +324,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + @@ -314,7 +334,6 @@ __all__ = [ + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + - "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 86a958e..14933a2 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + -from .token import generate_convo_ai_token, _validate_expires_in + +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + + AgentThinkAgentManagementRequestOnListeningAction, + +) + 
+from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + + AgentThinkAgentManagementRequestOnThinkingAction, + +) + +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + + AgentThinkAgentManagementRequestOnSpeakingAction, + +) + +from ..agent_management.types.agent_think_agent_management_response import ( + + AgentThinkAgentManagementResponse, + +) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + +AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + @@ -159,6 +171,14 @@ FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + +# Think type aliases and response + +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + +ThinkResponse = AgentThinkAgentManagementResponse + + + +from .token import generate_convo_ai_token, _validate_expires_in + + + + class Agent: + """A reusable agent definition. 
+ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e41a399..269619e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -20,10 +20,10 @@ from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + - is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -242,7 +242,11 @@ class _AgentSessionBase: + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + - if not is_rtc_avatar(avatar): + + if not is_avatar_token_managed(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index a04809c..aea9da1 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -1,3 +1,4 @@ + +import warnings + import typing + + + @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + + """Return True when AgentKit manages the avatar RTC publisher identity.""" + + return ( + + is_heygen_avatar(config) + + or is_live_avatar_avatar(config) + + or is_generic_avatar(config) + + ) + + + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + - params = config.get("params", {}) + - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + """Deprecated: use :func:`is_avatar_token_managed` for 
vendor gating.""" + + warnings.warn( + + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + + "and keep agora_uid checks in session enrichment.", + + DeprecationWarning, + + stacklevel=2, + ) + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + @@ -95,7 +110,7 @@ def validate_tts_sample_rate( + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - - HeyGen: ONLY supports 24,000 Hz + + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py + index f86e4d3..c0a852e 100644 + --- a/src/agora_agent/agentkit/constants.py + +++ b/src/agora_agent/agentkit/constants.py + @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + + + +# Think action value constants (match Fern wire values) + +ThinkOnListeningActionInject = "inject" + +ThinkOnListeningActionInterrupt = "interrupt" + +ThinkOnListeningActionIgnore = "ignore" + +ThinkOnThinkingActionInterrupt = "interrupt" + +ThinkOnThinkingActionIgnore = "ignore" + +ThinkOnSpeakingActionInterrupt = "interrupt" + +ThinkOnSpeakingActionIgnore = "ignore" + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + 
FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, + ) + # Deprecated think type aliases (prefer ThinkOn* names). 
+ from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + 
VertexAI, + XaiGrok, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + 
"AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, 
AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from 
..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import 
StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from 
..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from 
..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + 
ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _validate_expires_in + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. 
+ + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features 
+ self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + 
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + 
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." 
+ ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." + ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return 
properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + src/agora_agent/agentkit/avatar_types.py: | + import warnings + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. 
+ """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + 
avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. + """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/constants.py: | + """ + Type-safe constants for agent configuration values. + Use these instead of raw strings to avoid typos and get IDE autocomplete. 
+ """ + + # Data channel: "rtm" | "datastream" + class DataChannel: + RTM = "rtm" + DATASTREAM = "datastream" + + class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + + + # Silence action when timeout elapses: "speak" | "think" + # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) + class SilenceActionValues: + SPEAK = "speak" + THINK = "think" + + + # SAL mode: "locking" | "recognition" + # (Use for sal.sal_mode — avoids shadowing SalMode type) + class SalModeValues: + LOCKING = "locking" + RECOGNITION = "recognition" + + + # Geofence area: "GLOBAL" | "NORTH_AMERICA" | "EUROPE" | "ASIA" | "INDIA" | "JAPAN" + class GeofenceArea: + GLOBAL = "GLOBAL" + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Geofence exclude area (when area is GLOBAL) + class GeofenceExcludeArea: + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Filler word selection rule: "shuffle" | "round_robin" + class FillerWordsSelectionRule: + SHUFFLE = "shuffle" + ROUND_ROBIN = "round_robin" + + + # Turn detection type (deprecated; use TurnDetectionNestedConfig.EndOfSpeech instead) + class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + # Think action value constants (match Fern wire values) + ThinkOnListeningActionInject = "inject" + ThinkOnListeningActionInterrupt = "interrupt" + ThinkOnListeningActionIgnore = "ignore" + ThinkOnThinkingActionInterrupt = "interrupt" + ThinkOnThinkingActionIgnore = "ignore" + ThinkOnSpeakingActionInterrupt = "interrupt" + ThinkOnSpeakingActionIgnore = "ignore" + status: unresolved + - id: patch-972dd5bd + content_hash: sha256:10f86db20e0b5a3800efce4913b736ff338dee29eb18cb31e89658e0293b848e + original_commit: 972dd5bdafc09b3981ab2ce4e0d02beae165c626 + original_message: updated docs + original_author: 
digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 16:13:35 -0400 + Subject: [PATCH] updated docs + + --- + docs/reference/agent.md | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 1e88b8b..3163f9c 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -264,3 +264,18 @@ to_properties( + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + + +## Type aliases + + + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + + + +## Cross-SDK discovery map + + + +| Concept | Python | TypeScript | Go | + +|---|---|---|---| + +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | + | `failure_message` | `Optional[str]` | `None` | Spoken on error | + | `max_history` | `Optional[int]` | `None` | Max conversation history length | + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | 
`filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent.agentkit.vendors import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent.agentkit.vendors import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent.agentkit.vendors import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent.agentkit.vendors import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. 
+ + + ```python + from agora_agent.agentkit.vendors import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Override the system prompt. + + ### `with_greeting(greeting: str) -> Agent` + + Override the greeting message. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. 
+ + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Set the message spoken via TTS when the LLM call fails. + + ### `with_max_history(max_history: int) -> Agent` + + Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | System prompt | + | `greeting` | `Optional[str]` | Greeting message | + | `failure_message` | `Optional[str]` | Message spoken when LLM fails | + | `max_history` | `Optional[int]` | Max conversation history length | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ + ## Cross-SDK discovery map + + | Concept | Python | TypeScript | Go | + |---|---|---|---| + | STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + | xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + | Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + | Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + status: unresolved + - id: patch-7465fada + content_hash: sha256:9c6ed2e5f48702293eed8b213cc31cce63a7ed5a1ad16a0b23e791c13e77746f + original_commit: 7465fadafa0f1e62051d99b42d0eeda85f31eeee + original_message: "fix(agentkit): resolve Python session typing issues" + original_author: digitallysavvy + base_generation: 403a1a9ac18fd05139ac0140d1837d28f5805dcb + files: + - src/agora_agent/agentkit/agent_session.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action 
import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if 
self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + - id: patch-d29165c4 + content_hash: sha256:be59d1d3efc435d5e0b83305b2cd39ce3dad4534a4125de18028c137e692e659 + original_commit: d29165c4ddd8296af703a4e9ed848516f563dd1b + original_message: make python compat package publishable + original_author: chenyuguo + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/pyproject.toml + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 + From: chenyuguo + Date: Wed, 27 May 2026 17:24:50 +0800 + Subject: [PATCH] make python compat package publishable + + --- + compat/agora-agent-server-sdk/README.md | 2 ++ + compat/agora-agent-server-sdk/pyproject.toml | 3 +++ + .../src/agora_agent_server_sdk_compat/__init__.py | 1 + + 3 files changed, 6 insertions(+) + create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index 1388836..cff3cfe 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -9,3 +9,5 @@ pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index 8efbe53..ac93128 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -26,6 +26,9 @@ classifiers = [ + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + +packages = [ + + { include = "agora_agent_server_sdk_compat", from = "src"} + +] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + new file mode 100644 + index 0000000..55522c6 + --- /dev/null + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -0,0 +1 @@ + +"""Compatibility package for the renamed agora-agents distribution.""" + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.0.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.0.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility package for the renamed agora-agents distribution.""" + status: unresolved + - id: patch-fae1249a + content_hash: sha256:01bf21f3cc4c784dfcff80a48c9c7bb3123af4327a567b7c990b528e9780e9a2 + original_commit: fae1249a20c53761a2eb5515a1bf92ca666760d1 + original_message: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
+ original_author: digitallysavvy + base_generation: 403a1a9ac18fd05139ac0140d1837d28f5805dcb + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 27 May 2026 16:58:18 -0400 + Subject: [PATCH] Re-export agora-agents API from legacy PyPI compatibility + package The compat distribution delegates to agora_agent via __getattr__ and + documents both import paths in its README. + + --- + compat/agora-agent-server-sdk/README.md | 7 +++++-- + .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- + 2 files changed, 18 insertions(+), 3 deletions(-) + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index cff3cfe..e43d1d8 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -8,6 +8,9 @@ New projects should install: + pip install agora-agents + ``` + + -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ +```python + +from agora_agent import Agora, Area + +from agora_agent_server_sdk_compat import Agora, Area + +``` + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + index 55522c6..6283244 100644 + --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -1 +1,13 @@ + -"""Compatibility package for the renamed agora-agents distribution.""" + +"""Compatibility re-exports for the renamed agora-agents package.""" + + + +import agora_agent as _agora_agent + + + +__all__ = getattr(_agora_agent, "__all__", []) + + + + + +def __getattr__(name: str): + + return getattr(_agora_agent, name) + + + + + +def __dir__(): + + return dir(_agora_agent) + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. 
The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility re-exports for the renamed agora-agents package.""" + + import agora_agent as _agora_agent + + __all__ = getattr(_agora_agent, "__all__", []) + + + def __getattr__(name: str): + return getattr(_agora_agent, name) + + + def __dir__(): + return dir(_agora_agent) + user_owned: true + - id: patch-fc9d93c3 + content_hash: sha256:93877741bdad745fda5dd549d7c3dd6bc315f4574aabd2defb52c0c795bff011 + original_commit: fc9d93c3026a6109d8a5e8b386418592f8d121c5 + original_message: Document agora-agents PyPI install name and migration notes + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/installation.md + patch_content: | + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index c14bdb2..f6f1750 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. + ## Install with pip + + ```sh + -pip install agora-agent-sdk + +pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + -poetry add agora-agent-sdk + +poetry add agora-agents + ``` + + ## Dependencies + theirs_snapshot: + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. 
+ --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Dependencies + + The following packages are installed automatically: + + | Package | Purpose | + |---|---| + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Sync vs. Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. + status: unresolved + - id: patch-44c21c14 + content_hash: sha256:34f08060a06ca824943ab02e75c3c83ad43a1b6e7d09ec6f8fa244ef82de6fcd + original_commit: 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b + original_message: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
+ original_author: digitallysavvy + base_generation: 403a1a9ac18fd05139ac0140d1837d28f5805dcb + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_root_exports.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index f84862c..0d7a4aa 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from 
..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection 
import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from 
..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from 
..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from 
..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + 
EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + 
on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + 
+ def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: 
typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
def with_avatar(self, vendor: BaseAvatar) -> "Agent":
    """Return a copy of this agent configured with the given avatar vendor.

    The vendor's ``required_sample_rate`` is stored on the copy so a TTS
    vendor attached later can be checked against it.

    Raises
    ------
    ValueError
        If the avatar requires a specific (non-zero) sample rate and a TTS
        sample rate is already recorded on this agent with a different value.
    """
    # Combining an avatar with MLLM is intentionally not rejected here; the
    # enabled-avatar + MLLM combination is rejected later, in
    # ``to_properties`` / ``AgentSession.start``, so both can still be
    # configured for testing or for the disabled-avatar pattern.
    avatar_rate = vendor.required_sample_rate
    tts_rate = self._tts_sample_rate
    conflicting = (
        avatar_rate not in (None, 0)
        and tts_rate is not None
        and tts_rate != avatar_rate
    )
    if conflicting:
        raise ValueError(
            f"Avatar requires TTS sample rate of {avatar_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {avatar_rate}."
        )

    configured = self._clone()
    configured._avatar = vendor.to_config()
    configured._avatar_required_sample_rate = avatar_rate
    return configured
def with_tools(self, enabled: bool = True) -> "Agent":
    """Return a copy of this agent with the ``enable_tools`` advanced feature set.

    The existing advanced-features value keeps its representation: ``None``
    becomes a new ``StartAgentsRequestPropertiesAdvancedFeatures``, a dict is
    merged into a new dict, and any other object is copied through
    ``_copy_model_update``.
    """
    configured = self._clone()
    current = configured._advanced_features
    patch = {"enable_tools": enabled}

    if current is None:
        merged = StartAgentsRequestPropertiesAdvancedFeatures(**patch)
    elif isinstance(current, dict):
        merged = typing.cast(AdvancedFeatures, {**current, **patch})
    else:
        merged = self._copy_model_update(current, patch)

    configured._advanced_features = merged
    return configured
def with_labels(self, labels: typing.Dict[str, str]) -> "Agent":
    """Return a copy of this agent carrying the given custom labels.

    The mapping is copied with ``dict(labels)``, so later changes to the
    caller's object do not reach the returned agent.
    """
    labelled = self._clone()
    own_labels = dict(labels)
    labelled._labels = own_labels
    return labelled
def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]:
    """Return the session parameters, defaulting ``data_channel`` to ``"rtm"``.

    The default is applied only when ``enable_rtm`` on the advanced features
    is exactly ``True`` and the parameters carry no ``data_channel`` of their
    own. In every other case the stored parameters are returned as-is. When
    the default is applied, the stored parameters are never mutated: a new
    model, a merged dict, or a model copy is returned instead.
    """
    params = self._parameters
    rtm_requested = self._field_value(self._advanced_features, "enable_rtm") is True
    explicit_channel = self._field_value(params, "data_channel")

    needs_rtm_default = rtm_requested and explicit_channel is None
    if not needs_rtm_default:
        return params

    rtm_default = {"data_channel": "rtm"}
    if params is None:
        return StartAgentsRequestPropertiesParameters(**rtm_default)
    if isinstance(params, dict):
        return typing.cast(SessionParamsInput, {**params, **rtm_default})
    return self._copy_model_update(params, rtm_default)
@property
def config(self) -> typing.Dict[str, typing.Any]:
    """Return a snapshot dict of every configurable field on this agent.

    Each key is the public field name and each value is the current content
    of the matching underscore-prefixed attribute. Values are not copied.
    """
    exported_fields = (
        "name",
        "instructions",
        "greeting",
        "failure_message",
        "max_history",
        "llm",
        "tts",
        "stt",
        "mllm",
        "turn_detection",
        "interruption",
        "sal",
        "avatar",
        "advanced_features",
        "parameters",
        "geofence",
        "labels",
        "rtc",
        "filler_words",
        "greeting_configs",
    )
    return {field: getattr(self, "_" + field) for field in exported_fields}
+ + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + 
def _clone(self) -> "Agent":
    """Return a shallow copy of this agent for the fluent ``with_*`` builders.

    The copy is created without running ``__init__`` and receives every
    instance attribute of ``self``. Copying the instance ``__dict__`` instead
    of a hand-maintained attribute list means a newly added field can never be
    silently dropped by a builder call. Using ``type(self)`` keeps the result
    an instance of the caller's class when ``Agent`` is subclassed.

    Attribute values are shared, not deep-copied: builders are expected to
    rebind attributes on the copy rather than mutate the values in place.
    """
    duplicate = object.__new__(type(self))
    vars(duplicate).update(vars(self))
    return duplicate
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    # The class is declared with total=False, so every key below may be
    # omitted; the required keys are inherited from the base TypedDict.
    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # Either a single preset name or a sequence of preset names.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callback that receives a warning message string.
    warn: typing.Callable[[str], None]
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]:
    """Build the per-request ``Authorization`` header for app-credentials auth.

    Returns ``None`` unless the client's ``auth_mode`` is
    ``"app-credentials"``, in which case the client-level header is left in
    charge. Otherwise a fresh ConvoAI token is generated for this session's
    channel and agent UID and returned as ``Authorization: agora token=...``.
    The app id and certificate are read from the client first, falling back
    to the values stored on the session when the client lacks the attribute.

    Raises
    ------
    RuntimeError
        If app-credentials mode is active but no app certificate is available.
    """
    client = self._client
    uses_app_credentials = getattr(client, "auth_mode", None) == "app-credentials"
    if not uses_app_credentials:
        return None

    resolved_app_id: str = getattr(client, "app_id", self._app_id)
    certificate: typing.Optional[str] = getattr(client, "app_certificate", self._app_certificate)
    if not certificate:
        raise RuntimeError("app_certificate is required for app-credentials auth mode")

    numeric_uid = _parse_numeric_uid(self._agent_uid, "agent_uid")
    token = generate_convo_ai_token(
        app_id=resolved_app_id,
        app_certificate=certificate,
        channel_name=self._channel,
        uid=numeric_uid,
    )
    return {"Authorization": f"agora token={token}"}
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
@staticmethod
def _dump_model(value: typing.Any) -> typing.Any:
    """Recursively convert models and containers into plain Python data.

    * An object with ``model_dump`` is serialized with ``exclude_none=True``.
    * A dict is rebuilt with ``None`` values dropped and the remaining
      values converted recursively.
    * A list is rebuilt with each item converted recursively (``None`` items
      are kept).
    * An object that only offers ``dict`` is serialized with
      ``dict(exclude_none=True)``. This matches the ``model_dump`` / ``dict``
      fallback used by ``_dump_optional_model`` in ``agent.py``, so the
      session still gets a plain mapping when models expose the older API.
    * Anything else is returned unchanged.
    """

    def _dump(node: typing.Any) -> typing.Any:
        if hasattr(node, "model_dump"):
            return node.model_dump(exclude_none=True)
        if isinstance(node, dict):
            return {key: _dump(item) for key, item in node.items() if item is not None}
        if isinstance(node, list):
            return [_dump(item) for item in node]
        # Checked after the container cases so plain dicts and lists are
        # always handled structurally.
        legacy_dump = getattr(node, "dict", None)
        if callable(legacy_dump):
            return legacy_dump(exclude_none=True)
        return node

    return _dump(value)
@staticmethod
def _response_turns(response: typing.Any) -> typing.List[typing.Any]:
    """Return the ``turns`` of a response as a new list.

    ``response`` may be a dict (read by key) or any object (read by
    attribute). A missing, ``None`` or otherwise empty ``turns`` value yields
    an empty list.
    """
    if isinstance(response, dict):
        raw_turns = response.get("turns")
    else:
        raw_turns = getattr(response, "turns", None)

    if not raw_turns:
        return []
    return list(raw_turns)
def _emit(self, event: str, data: typing.Any) -> None:
    """Invoke every handler registered for ``event`` with ``data``.

    Handlers run in registration order. The handler list is snapshotted
    before dispatch, so a handler that registers or unregisters handlers
    (for example a one-shot handler calling ``off`` on itself) cannot cause
    another handler to be skipped; such changes take effect on the next emit.

    An exception raised by a handler is reported through ``warnings.warn``
    and does not stop the remaining handlers or the session lifecycle.
    """
    registered = self._event_handlers.get(event)
    if not registered:
        return
    # Iterate a copy: removing from the live list while looping over it would
    # shift later handlers under the iterator and skip the next one.
    for handler in tuple(registered):
        try:
            handler(data)
        except Exception as exc:
            # Prevent a misbehaving handler from blocking other handlers or
            # the session lifecycle. Warn so the error is not silently lost.
            warnings.warn(
                f"Event handler for '{event}' raised an exception: {exc}",
                stacklevel=2,
            )
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_root_exports.py: | + import pytest + + import agora_agent + import agora_agent.agentkit as agentkit + + + def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + + def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + + def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + + def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ + - id: patch-d475306b + content_hash: sha256:407af5e7564d6e8d0b91f1e117cb433aec931f083225af53c6df2abfff281b22 + original_commit: d475306bd42279984bcf4934b900003e8e02c4eb + original_message: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - docs/getting-started/installation.md + patch_content: | + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index e43d1d8..1da36aa 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index 04b48da..8fca9ab 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + -See [Authentication](./authentication.md) for setup details. + +## Next steps + + + +- [Authentication](./authentication.md) — configure your credentials + +- [Quick Start](./quick-start.md) — build your first conversational agent + + + +## Migrating from a previous package name + + + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). 
+ theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. + --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Imports + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI + ``` + + The package installs as `agora-agents` and imports as `agora_agent`. + + ## Sync vs. 
Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + ## Dependencies + + | Package | Purpose | + | ------------------------------ | ------------------------------------------------------ | + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Next steps + + - [Authentication](./authentication.md) — configure your credentials + - [Quick Start](./quick-start.md) — build your first conversational agent + + ## Migrating from a previous package name + + The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). + status: unresolved + - id: patch-c9355576 + content_hash: sha256:83b3b6148b21f2b4d53ee67321777522f5f4e871b61ea3b23f3a6b88ca052769 + original_commit: c93555763ffd63267a737b3e430217a890f203db + original_message: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/authentication.md + - docs/guides/low-level-api.md + patch_content: | + diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md + index 31dcc56..74c62cd 100644 + --- a/docs/getting-started/authentication.md + +++ b/docs/getting-started/authentication.md + @@ -46,41 +46,6 @@ session = agent.create_session( + print(client.auth_mode) # "app-credentials" + ``` + + -## Other auth modes + +## Legacy auth modes + + -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. + - + -### Token auth (`auth_token`) + - + -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - auth_token="your-rest-auth-token", + -) + - + -session = agent.create_session( + - client, + - channel="room-123", + - agent_uid="1", + - remote_uids=["100"], + - token="your-rtc-join-token", + -) + -``` + - + -### Basic Auth (`customer_id` + `customer_secret`) + - + -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - customer_id="your-customer-id", + - customer_secret="your-customer-secret", + -) + -``` + +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. 
+ diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md + index 6677b45..47397b7 100644 + --- a/docs/guides/low-level-api.md + +++ b/docs/guides/low-level-api.md + @@ -1,187 +1,55 @@ + --- + sidebar_position: 10 + title: Low-Level API + -description: Direct client.agents.start() usage without the builder pattern. + +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. + +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + -## Raw telephony and phone-number APIs + - + -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + - + -- `client.telephony` for call status and hangup operations + -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
+ + -## Cascading flow (ASR → LLM → TTS) + +## Client setup + + ```python + from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import Tts_Elevenlabs + + client = Agora( + area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + -client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + + app_id="your-app-id", + + app_certificate="your-app-certificate", + ) + ``` + + -## Async (low-level) + +## Raw telephony and phone-number APIs + + -```python + -import asyncio + -from agora_agent import Area, AsyncAgora + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import 
Tts_Elevenlabs + +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + -client = AsyncAgora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + +- `client.telephony` for call status and hangup operations + +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + + +```python + +calls = client.telephony.list( + + appid=client.app_id, + + type="sip", + ) + + -async def main() -> None: + - await client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + - ) + - + -asyncio.run(main()) + +for call in calls: + + print(call.id, call.state) + ``` + + -## MLLM flow (multimodal) + +## Direct agent APIs + + -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). + +`client.agents` exposes the generated REST surface for advanced integrations. 
Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + -```python + -from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesMllm, + - StartAgentsRequestPropertiesMllmVendor, + - StartAgentsRequestPropertiesTts, + - StartAgentsRequestPropertiesTtsVendor, + - StartAgentsRequestPropertiesLlm, + -) + +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + -client = Agora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + - + -client.agents.start( + - client.app_id, + - name="mllm_agent", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="your_token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - mllm=StartAgentsRequestPropertiesMllm( + - enable=True, + - url="wss://api.openai.com/v1/realtime", + - api_key="", + - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + - params={ + - "model": "gpt-4o-realtime-preview", + - "voice": "alloy", + - }, + - input_modalities=["audio"], + - output_modalities=["text", "audio"], + - greeting_message="Hello! I'm ready to chat in real-time.", + - turn_detection={ + - "mode": "server_vad", + - "server_vad_config": { + - "idle_timeout_ms": 5000, + - }, + - }, + - ), + - ), + +```python + +info = session.raw.get( + + appid=session.app_id, + + agent_id=session.id, + ) + ``` + + -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). + +You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ theirs_snapshot: + docs/getting-started/authentication.md: | + --- + sidebar_position: 2 + title: Authentication + description: Configure the Python SDK with app credentials and understand other supported auth modes. + --- + + # Authentication + + Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. + + ## App credentials + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + + agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) + ) + + session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + ) + ``` + + ## Why app credentials + + - Fresh short-lived tokens per API call instead of reusing long-lived credentials + - No Customer ID / Customer Secret in request headers + - No manual REST or RTC token provisioning in application code + + ## Inspecting auth mode + + ```python + print(client.auth_mode) # "app-credentials" + ``` + + ## Legacy auth modes + + The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. + docs/guides/low-level-api.md: | + --- + sidebar_position: 10 + title: Low-Level API + description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + Use the `Agent` builder and `AgentSession` for conversational agent starts. 
That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. + + ## Client setup + + ```python + from agora_agent import Agora, Area + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + ``` + + ## Raw telephony and phone-number APIs + + AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + - `client.telephony` for call status and hangup operations + - `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + ```python + calls = client.telephony.list( + appid=client.app_id, + type="sip", + ) + + for call in calls: + print(call.id, call.state) + ``` + + ## Direct agent APIs + + `client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + ```python + info = session.raw.get( + appid=session.app_id, + agent_id=session.id, + ) + ``` + + You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ status: unresolved diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f84862c..0d7a4aa 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( AgentThinkAgentManagementRequestOnListeningAction, ) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index a749d1e..ddcd930 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -24,6 +24,7 @@ is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 50bdd08..f48098c 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -177,6 +177,49 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "generic", "params": params} +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = 
Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + +class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index b58f040..62cb3f2 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import 
BaseModel, ConfigDict, Field diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..9719b04 --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), {"data_channel": "rtm"}) # noqa: SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def test_with_audio_scenario_sets_session_parameter(): + agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def 
test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "agent greeting" + assert properties.llm.failure_message == "agent failure" + assert properties.llm.max_history == 2 + + +def test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + 
) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard must fire before the token-generation step so callers get a + clear, actionable error even when app_id/app_certificate are empty. 
+ """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + 
assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..198fcd0 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,383 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, + RimeTTS, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + warn=warn, + ) + + +def test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = 
Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + 
.with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["llm"]["greeting_message"] == "agent greeting" + assert properties["llm"]["failure_message"] == "agent failure" + assert properties["llm"]["max_history"] == 2 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + 
agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 + + +def test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + 
session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with 
pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..8473821 --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,122 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +import warnings + +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + +def test_xai_realtime_deprecated_alias_emits_same_vendor(): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", DeprecationWarning) + config = XaiRealtime(api_key="xai-key").to_config() + assert len(caught) == 1 + assert issubclass(caught[0].category, DeprecationWarning) + assert config["vendor"] == "xai" + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def test_generic_avatar_omits_session_enriched_fields_when_unset(): + config = GenericAvatar( + api_key="avatar-key", + 
api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + assert config["params"]["model"] == "explicit-model" + assert config["params"]["project_id"] == "explicit-project" + assert config["params"]["location"] == "explicit-region" + assert config["params"]["adc_credentials_string"] == "{}" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), + ).to_config() + + assert config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py new file mode 100644 index 0000000..fa73fc0 --- 
/dev/null +++ b/tests/custom/test_avatar_token.py @@ -0,0 +1,12 @@ +from agora_agent.agentkit import generate_convo_ai_token + + +def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): + token = generate_convo_ai_token( + app_id="0" * 32, + app_certificate="1" * 32, + channel_name="room", + uid=123, + ) + + assert token.startswith("007") diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py new file mode 100644 index 0000000..faca9bf --- /dev/null +++ b/tests/custom/test_llm_vendors.py @@ -0,0 +1,60 @@ +from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM + + +def test_groq_serializes_as_openai_compatible() -> None: + config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() + + assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" + assert config["api_key"] == "groq-key" + assert config["style"] == "openai" + assert config["params"]["model"] == "llama-3.3-70b-versatile" + + +def test_custom_llm_marks_request_as_custom() -> None: + config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() + + assert config["url"] == "https://llm.example.com/chat" + assert config["api_key"] == "key" + assert config["vendor"] == "custom" + assert config["style"] == "openai" + + +def test_vertex_ai_llm_includes_project_routing() -> None: + config = VertexAILLM( + api_key="vertex-token", + model="gemini-2.0-flash", + project_id="project", + location="us-central1", + ).to_config() + + assert config["api_key"] == "vertex-token" + assert config["style"] == "gemini" + assert config["params"]["model"] == "gemini-2.0-flash" + assert config["params"]["project_id"] == "project" + assert config["params"]["location"] == "us-central1" + + +def test_amazon_bedrock_serializes_as_anthropic_style() -> None: + config = AmazonBedrock( + api_key="bedrock-key", + url="https://bedrock.example.com/messages", + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + 
).to_config() + + assert config["api_key"] == "bedrock-key" + assert config["style"] == "anthropic" + assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + + +def test_dify_serializes_conversation_fields() -> None: + config = Dify( + api_key="dify-key", + url="https://api.dify.ai/v1/chat-messages", + user="user-1", + conversation_id="conversation-1", + ).to_config() + + assert config["api_key"] == "dify-key" + assert config["style"] == "dify" + assert config["params"]["user"] == "user-1" + assert config["params"]["conversation_id"] == "conversation-1" diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py new file mode 100644 index 0000000..9b2f508 --- /dev/null +++ b/tests/custom/test_root_exports.py @@ -0,0 +1,29 @@ +import pytest + +import agora_agent +import agora_agent.agentkit as agentkit + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + +def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + +def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + +def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + +def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ From cb9ab8b86b57176086dfd86cf35d4fcfd311cc87 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Tue, 2 Jun 2026 03:14:56 -0400 Subject: [PATCH 20/26] docs(agentkit): align OpenAI TTS instructions support --- changelog.md | 2 +- docs/reference/vendors.md | 1 + tests/custom/test_tts_vendors.py | 3 ++- 3 files changed, 4 
insertions(+), 2 deletions(-) diff --git a/changelog.md b/changelog.md index efcc782..f60d9b8 100644 --- a/changelog.md +++ b/changelog.md @@ -107,7 +107,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Added -- **`OpenAITTS`** — New optional parameters: `response_format` (str, e.g. `"pcm"`) and `speed` (float). +- **`OpenAITTS`** — New optional parameters: `instructions` (str) and `speed` (float). - **`CartesiaTTS`** — `voice_id` user-facing field is preserved; voice is serialized to the required nested object format automatically. - **`RimeTTS`** — New optional parameters: `lang` (str), `sampling_rate` (int, serialized as `samplingRate`), `speed_alpha` (float, serialized as `speedAlpha`). - **`OpenAIRealtime`** — New optional parameter: `failure_message` (str). diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 2a39993..d54aada 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -190,6 +190,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `voice` | `str` | Yes | — | Voice: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` | | `model` | `str` | BYOK only | `None` | Model: `tts-1` or `tts-1-hd` | | `base_url` | `str` | BYOK only | `None` | OpenAI TTS endpoint URL | +| `instructions` | `str` | No | `None` | Custom instructions for voice style, accent, pace, and tone | | `speed` | `float` | No | `None` | Speech speed multiplier | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index 2936872..9499eca 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -52,11 +52,12 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: "encoding": "linear16", } - assert OpenAITTS(api_key="openai-key", voice="coral", model="gpt-4o-mini-tts", base_url="https://api.openai.com/v1").to_config()["params"] == { + assert 
OpenAITTS(api_key="openai-key", voice="coral", model="gpt-4o-mini-tts", base_url="https://api.openai.com/v1", instructions="speak clearly").to_config()["params"] == { "voice": "coral", "api_key": "openai-key", "base_url": "https://api.openai.com/v1", "model": "gpt-4o-mini-tts", + "instructions": "speak clearly", } assert OpenAITTS(voice="coral").to_config()["params"] == { From c902235476f218fedd3f23856d04edda1df26a44 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Tue, 2 Jun 2026 03:18:56 -0400 Subject: [PATCH 21/26] docs(agentkit): align TTS provider reference fields --- docs/reference/vendors.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index d54aada..f06b0a8 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -180,6 +180,8 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `region` | `str` | Yes | — | Azure region (e.g., `eastus`) | | `voice_name` | `str` | Yes | — | Voice name (e.g., `en-US-JennyNeural`) | | `sample_rate` | `int` | No | `None` | Sample rate: 8000, 16000, 24000, or 48000 Hz | +| `speed` | `float` | No | `None` | Speaking rate multiplier | +| `volume` | `float` | No | `None` | Audio volume | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `OpenAITTS` @@ -203,6 +205,8 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `api_key` | `str` | Yes | — | Cartesia API key | | `voice_id` | `str` | Yes | — | Voice ID (serialized as `{"mode": "id", "id": "..."}`) | | `model_id` | `str` | Yes | — | Model ID | +| `base_url` | `str` | No | `None` | WebSocket URL | +| `language` | `str` | No | `None` | Target language | | `sample_rate` | `int` | No | `None` | Sample rate: 8000–48000 Hz | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | @@ -213,6 +217,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | 
`key` | `str` | Yes | — | Google Cloud API key | | `voice_name` | `str` | Yes | — | Voice name | | `language_code` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `sample_rate_hertz` | `int` | No | `None` | Sample rate in Hz | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `AmazonTTS` @@ -273,13 +278,15 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `key` | `str` | Yes | — | MiniMax API key | -| `group_id` | `str` | Yes | — | MiniMax group ID | +| `key` | `str` | BYOK only | `None` | MiniMax API key. Optional for supported Agora-managed MiniMax models | +| `group_id` | `str` | BYOK only | `None` | MiniMax group ID | | `model` | `str` | Yes | — | Model name (e.g., `speech-02-turbo`) | -| `voice_id` | `str` | Yes | — | Voice style identifier | -| `url` | `str` | Yes | — | WebSocket endpoint | +| `voice_id` | `str` | BYOK only | `None` | Voice style identifier | +| `url` | `str` | BYOK only | `None` | WebSocket endpoint | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | +`key`, `group_id`, `voice_id`, and `url` are required together for BYOK. Without `key`, `model` must be one of the supported Agora-managed MiniMax models. 
+ ### `MurfTTS` | Parameter | Type | Required | Default | Description | @@ -301,6 +308,10 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `key` | `str` | Yes | — | Sarvam API key | | `speaker` | `str` | Yes | — | Speaker name | | `target_language_code` | `str` | Yes | — | Target language code | +| `pitch` | `float` | No | `None` | Pitch adjustment | +| `pace` | `float` | No | `None` | Speed of speech | +| `loudness` | `float` | No | `None` | Volume level | +| `sample_rate` | `int` | No | `None` | Audio sample rate | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | --- From 420547b24367ac2301e11c0159aa9484b8647191 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Tue, 2 Jun 2026 03:25:13 -0400 Subject: [PATCH 22/26] docs: add v2.1.0 changelog --- changelog.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/changelog.md b/changelog.md index f60d9b8..bd13913 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [v2.1.0] — 2026-06-02 + +### Added + +- **ASR interaction language** — AgentKit now manages Agora `asr.language` through `interaction_language` / `Agent.with_interaction_language()`, validates it against the supported BCP-47 interaction language list, and sends the default `en-US` when no language is provided. +- **Provider parameter parity** — ASR, LLM, MLLM, TTS, and avatar wrappers expose typed provider parameters plus passthrough fields where the generated core supports additional properties. + +### Changed + +- **Generated core refresh** — Regenerated core types from the v2.1 API schema. +- **Deepgram TTS passthrough** — `DeepgramTTS` now uses `additional_params` for passthrough fields and flattens them into `tts.params`; the removed nested `params.params` shape is no longer documented or emitted. 
+- **OpenAI TTS** — Docs and tests now reflect the generated core shape, including `instructions` and `speed` under `tts.params`. +- **TTS provider docs** — Updated TTS provider reference tables to match implemented wrapper fields and generated core params. + +### Fixed + +- **Managed-provider validation** — AgentKit validation now distinguishes preset-backed providers from BYOK providers so required provider fields are only required when credentials are caller-supplied. +- **ASR language separation** — Provider-specific STT language values remain under `asr.params`, while Agora interaction language is emitted separately as `asr.language`. + ## [v2.0.0] — 2026-05-21 ### Added From 299e4bd9cb59bd6144084332a7c3fa7bf260769f Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Tue, 2 Jun 2026 03:34:29 -0400 Subject: [PATCH 23/26] fix(agentkit): resolve provider config type checks --- src/agora_agent/agentkit/agent.py | 36 +++++++++++++++++++++- src/agora_agent/agentkit/vendors/llm.py | 20 ++++++++++--- src/agora_agent/agentkit/vendors/mllm.py | 2 +- src/agora_agent/agentkit/vendors/stt.py | 38 ++++++++++++++++++++++-- 4 files changed, 88 insertions(+), 8 deletions(-) diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 0ab0f5b..61db721 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -246,7 +246,41 @@ class SessionOptions(typing_extensions.TypedDict, total=False): ] DEFAULT_INTERACTION_LANGUAGE: InteractionLanguage = "en-US" -_INTERACTION_LANGUAGES = set(InteractionLanguage.__args__) +INTERACTION_LANGUAGE_VALUES: typing.Tuple[InteractionLanguage, ...] 
= ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +) +_INTERACTION_LANGUAGES = set(INTERACTION_LANGUAGE_VALUES) def _dump_optional_model(value: typing.Any) -> typing.Any: diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index ba3aa3e..9156a01 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -384,7 +384,7 @@ def to_config(self) -> Dict[str, Any]: return config -class AmazonBedrockOptions(AnthropicOptions): +class AmazonBedrockOptions(BaseModel): model_config = ConfigDict(extra="forbid") access_key: str = Field(..., description="AWS access key ID") @@ -392,9 +392,21 @@ class AmazonBedrockOptions(AnthropicOptions): region: str = Field(..., description="AWS region") model: str = Field(..., description="Amazon Bedrock model identifier") max_tokens: Optional[int] = Field(default=None, gt=0) - api_key: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions compatibility") - url: Optional[str] = Field(default=None, description="Unused; kept for AnthropicOptions compatibility") + url: Optional[str] = Field(default=None, description="Amazon Bedrock converse stream endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = 
Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") class AmazonBedrock(BaseLLM): diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index e5fcb5b..236a494 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -44,7 +44,7 @@ def to_config(self) -> Dict[str, Any]: or self.options.instructions is not None or self.options.input_audio_transcription is not None ): - params = {} + params: Dict[str, Any] = {} if self.options.model is not None: params["model"] = self.options.model if self.options.params is not None: diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index 47d94f1..68c7f1d 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Literal @@ -40,7 +40,41 @@ "vi-VN", ] -_INTERACTION_LANGUAGES = set(InteractionLanguage.__args__) +INTERACTION_LANGUAGE_VALUES: Tuple[InteractionLanguage, ...] 
= ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +) +_INTERACTION_LANGUAGES = set(INTERACTION_LANGUAGE_VALUES) _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} From 583eccc00eb0b1d47b75df202a8885adf514573b Mon Sep 17 00:00:00 2001 From: "Hermes (agora)" Date: Tue, 2 Jun 2026 14:04:56 -0400 Subject: [PATCH 24/26] Move AgentKit language to turn detection Move the Agora interaction language setting from the unlaunched top-level AgentKit API into turn_detection.language across TypeScript, Python, and Go. Remove the top-level interaction language helpers and STT vendor override fields, keep provider-specific STT language under asr.params, and default turn_detection.language to en-US when omitted. Update tests, READMEs, docs, and changelogs to reflect the final v2.1.0 API surface. --- README.md | 6 +- changelog.md | 4 +- docs/concepts/agent.md | 2 +- docs/concepts/vendors.md | 2 +- docs/reference/agent.md | 6 +- docs/reference/vendors.md | 9 +-- src/agora_agent/agentkit/__init__.py | 4 +- src/agora_agent/agentkit/agent.py | 64 ++++++++--------- src/agora_agent/agentkit/vendors/stt.py | 70 ++++++++----------- ...gents_request_properties_turn_detection.py | 6 ++ tests/custom/test_stt_language.py | 34 ++++----- 11 files changed, 96 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index 1fafba5..269cc61 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install agora-agents ## Quick Start Start with the `Agent` builder: create a client with app credentials, choose your ASR, LLM, and TTS providers, then start a session. Omit vendor API keys for supported Agora-managed models, or provide keys when you want BYOK. 
-Use `with_interaction_language()` for Agora `asr.language`; provider-specific STT language values remain under `asr.params`. +Set Agora interaction language with `turn_detection.language`; provider-specific STT language values remain under `asr.params`. ```python import os @@ -54,7 +54,7 @@ def start_conversation() -> str: app_certificate=app_certificate, ) - agent = Agent(name=f"conversation-{int(time.time())}").with_interaction_language("en-US").with_stt( + agent = Agent(name=f"conversation-{int(time.time())}", turn_detection={"language": "en-US"}).with_stt( DeepgramSTT( model="nova-3", language="en", @@ -101,7 +101,7 @@ def start_conversation() -> str: Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed models. ```python -agent = Agent().with_interaction_language("en-US").with_stt( +agent = Agent(turn_detection={"language": "en-US"}).with_stt( DeepgramSTT( api_key=os.environ["DEEPGRAM_API_KEY"], model="nova-3", diff --git a/changelog.md b/changelog.md index bd13913..dc8dcc6 100644 --- a/changelog.md +++ b/changelog.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Added -- **ASR interaction language** — AgentKit now manages Agora `asr.language` through `interaction_language` / `Agent.with_interaction_language()`, validates it against the supported BCP-47 interaction language list, and sends the default `en-US` when no language is provided. +- **Turn detection language** — AgentKit now manages Agora interaction language through `turn_detection.language`, validates it against the supported BCP-47 language list, and sends the default `en-US` when no language is provided. - **Provider parameter parity** — ASR, LLM, MLLM, TTS, and avatar wrappers expose typed provider parameters plus passthrough fields where the generated core supports additional properties. 
### Changed @@ -21,7 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Fixed - **Managed-provider validation** — AgentKit validation now distinguishes preset-backed providers from BYOK providers so required provider fields are only required when credentials are caller-supplied. -- **ASR language separation** — Provider-specific STT language values remain under `asr.params`, while Agora interaction language is emitted separately as `asr.language`. +- **Language placement** — Provider-specific STT language values remain under `asr.params`, while Agora interaction language is emitted separately as `turn_detection.language`. ## [v2.0.0] — 2026-05-21 diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index 1122c59..8a75762 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -64,7 +64,7 @@ Each `with_*` method returns a **new** `Agent` instance — the original is unch | `with_instructions(text)` | `str` | Deprecated. Use LLM vendor `system_messages` instead. | | `with_greeting(text)` | `str` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | | `with_name(name)` | `str` | Override the agent name | -| `with_turn_detection(config)` | `TurnDetectionConfig` | Override cascading-flow SOS/EOS detection; use `with_interruption()` for interruption behavior | +| `with_turn_detection(config)` | `TurnDetectionConfig` | Configure `turn_detection.language` and cascading-flow SOS/EOS detection; use `with_interruption()` for interruption behavior | | `with_sal(config)` | `SalConfig` | Set SAL configuration | | `with_advanced_features(features)` | `Dict[str, Any]` | Set advanced features | | `with_parameters(parameters)` | `SessionParams` | Set session parameters | diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 5c49e23..c59ae7c 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -75,7 +75,7 @@ tts = ElevenLabsTTS( Used with `agent.with_stt()`. 
-Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. | Class | Provider | Required Parameters | |---|---|---| diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 9094ba5..187229f 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -34,7 +34,7 @@ Agent( |---|---|---|---| | `name` | `Optional[str]` | `None` | Agent name, used as default session name | | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | -| `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | +| `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | @@ -109,7 +109,7 @@ agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', ago ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` -Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. +Override cascading-flow turn detection settings. 
Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. Pause-state detection is configured under semantic end-of-speech: @@ -257,7 +257,7 @@ to_properties( | `stt` | `Optional[Dict[str, Any]]` | STT config dict | | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | -| `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | +| `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | | `sal` | `Optional[SalConfig]` | SAL configuration | | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | | `parameters` | `Optional[SessionParams]` | Session parameters | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index f06b0a8..cfa8580 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -318,7 +318,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid ## STT Vendors -Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. 
### `SpeechmaticsSTT` @@ -326,7 +326,6 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to |---|---|---|---|---| | `api_key` | `str` | Yes | — | Speechmatics API key | | `language` | `str` | Yes | — | Language code (e.g., `en`) | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `uri` | `str` | No | `None` | Speechmatics streaming WebSocket URL | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -337,7 +336,6 @@ Use `agent.with_interaction_language()` for Agora `asr.language`; it defaults to | `api_key` | `str` | BYOK only | `None` | Deepgram API key. Optional only for Agora-managed `nova-2` and `nova-3`. | | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `smart_format` | `bool` | No | `None` | Enable smart formatting | | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -351,7 +349,6 @@ For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. For | `key` | `str` | Yes | — | Azure subscription key | | `region` | `str` | Yes | — | Azure region (e.g., `eastus`) | | `language` | `str` | Yes | — | Language code (e.g., `en-US`) | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `OpenAISTT` @@ -363,7 +360,6 @@ For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. 
For | `language` | `str` | No | `None` | Language code | | `prompt` | `str` | No | `None` | Prompt for OpenAI transcription | | `input_audio_transcription` | `Dict[str, Any]` | No | `None` | OpenAI transcription settings | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `GoogleSTT` @@ -374,7 +370,6 @@ For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. For | `location` | `str` | Yes | — | Google Cloud region | | `adc_credentials_string` | `str` | Yes | — | Google service account credentials JSON string | | `language` | `str` | Yes | — | Language code (e.g., `en-US`) | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `model` | `str` | No | `None` | Recognition model | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -386,7 +381,6 @@ For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. For | `secret_key` | `str` | Yes | — | AWS Secret Access Key | | `region` | `str` | Yes | — | AWS region (e.g., `us-east-1`) | | `language` | `str` | Yes | — | Amazon `language_code` | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `AssemblyAISTT` @@ -395,7 +389,6 @@ For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. 
For |---|---|---|---|---| | `api_key` | `str` | Yes | — | AssemblyAI API key | | `language` | `str` | Yes | — | Language code | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `uri` | `str` | No | `None` | AssemblyAI streaming WebSocket URL | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index ff20d29..8a8fdf2 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -3,7 +3,7 @@ AgentConfig, AgentConfigUpdate, AsrConfig, - InteractionLanguage, + TurnDetectionLanguage, ConversationHistory, ConversationRole, ConversationSessionTurn, @@ -205,7 +205,7 @@ "LlmStyle", "SttConfig", "AsrConfig", - "InteractionLanguage", + "TurnDetectionLanguage", "SttVendor", "TtsConfig", "MllmConfig", diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 61db721..fea1f0d 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -210,7 +210,7 @@ class SessionOptions(typing_extensions.TypedDict, total=False): from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in -InteractionLanguage = typing_extensions.Literal[ +TurnDetectionLanguage = typing_extensions.Literal[ "ar-EG", "ar-JO", "ar-SA", @@ -245,8 +245,8 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "vi-VN", ] -DEFAULT_INTERACTION_LANGUAGE: InteractionLanguage = "en-US" -INTERACTION_LANGUAGE_VALUES: typing.Tuple[InteractionLanguage, ...] = ( +DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" +TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] 
= ( "ar-EG", "ar-JO", "ar-SA", @@ -280,7 +280,7 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "tr-TR", "vi-VN", ) -_INTERACTION_LANGUAGES = set(INTERACTION_LANGUAGE_VALUES) +_TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) def _dump_optional_model(value: typing.Any) -> typing.Any: @@ -291,12 +291,12 @@ def _dump_optional_model(value: typing.Any) -> typing.Any: return value -def _is_interaction_language(value: typing.Any) -> bool: - return isinstance(value, str) and value in _INTERACTION_LANGUAGES +def _is_turn_detection_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _TURN_DETECTION_LANGUAGES -def _validate_interaction_language(value: typing.Any) -> InteractionLanguage: - if not _is_interaction_language(value): +def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: + if not _is_turn_detection_language(value): raise ValueError(f"Invalid interaction language: {value}") return value # type: ignore[return-value] @@ -335,7 +335,6 @@ def __init__( sal: typing.Optional[SalConfig] = None, advanced_features: typing.Optional[AdvancedFeatures] = None, parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, - interaction_language: typing.Optional[InteractionLanguage] = None, greeting: typing.Optional[str] = None, failure_message: typing.Optional[str] = None, max_history: typing.Optional[int] = None, @@ -362,11 +361,6 @@ def __init__( self._sal = sal self._advanced_features = advanced_features self._parameters = parameters - self._interaction_language = ( - _validate_interaction_language(interaction_language) - if interaction_language is not None - else None - ) self._geofence = geofence self._labels = labels self._rtc = rtc @@ -400,16 +394,6 @@ def with_stt(self, vendor: BaseSTT) -> "Agent": new_agent._stt = vendor.to_config() return new_agent - def with_interaction_language(self, language: InteractionLanguage) -> "Agent": - """Returns a new Agent 
with the Agora interaction language. - - This serializes to ``asr.language``. Vendor-specific language values - remain under ``asr.params``, for example ``asr.params.language``. - """ - new_agent = self._clone() - new_agent._interaction_language = _validate_interaction_language(language) - return new_agent - def with_mllm(self, vendor: BaseMLLM) -> "Agent": # Note: avatars are not supported with MLLM. The combination is rejected # at ``to_properties`` / ``AgentSession.start`` so callers can still @@ -705,10 +689,6 @@ def rtc(self) -> typing.Optional[RtcConfig]: def filler_words(self) -> typing.Optional[FillerWordsConfig]: return self._filler_words - @property - def interaction_language(self) -> typing.Optional[InteractionLanguage]: - return self._interaction_language - @property def config(self) -> typing.Dict[str, typing.Any]: return { @@ -727,7 +707,6 @@ def config(self) -> typing.Dict[str, typing.Any]: "avatar": self._avatar, "advanced_features": self._advanced_features, "parameters": self._parameters, - "interaction_language": self._interaction_language, "geofence": self._geofence, "labels": self._labels, "rtc": self._rtc, @@ -909,6 +888,7 @@ def to_properties( return StartAgentsRequestProperties(**base_kwargs) base_kwargs["asr"] = self._resolve_asr_config() + base_kwargs["turn_detection"] = self._resolve_turn_detection_config() if skip_vendor_validation: return StartAgentsRequestProperties(**base_kwargs) @@ -940,13 +920,28 @@ def to_properties( def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: asr_config = dict(self._stt or {}) - existing_language = asr_config.get("language") - language = self._interaction_language - if language is None: - language = existing_language if _is_interaction_language(existing_language) else DEFAULT_INTERACTION_LANGUAGE - asr_config["language"] = language + asr_config.pop("language", None) + if not asr_config: + asr_config["vendor"] = "ares" return asr_config + def _resolve_turn_detection_config(self) -> 
TurnDetectionConfig: + existing_stt_language = self._stt.get("language") if self._stt is not None else None + existing_turn_detection_language = self._field_value(self._turn_detection, "language") + language = ( + existing_turn_detection_language + if existing_turn_detection_language is not None + else existing_stt_language + if _is_turn_detection_language(existing_stt_language) + else DEFAULT_TURN_DETECTION_LANGUAGE + ) + language = _validate_turn_detection_language(language) + if self._turn_detection is None: + return StartAgentsRequestPropertiesTurnDetection(language=language) + if isinstance(self._turn_detection, dict): + return typing.cast(TurnDetectionConfig, {**self._turn_detection, "language": language}) + return self._copy_model_update(self._turn_detection, {"language": language}) + def _clone(self) -> "Agent": new_agent = Agent.__new__(Agent) new_agent._name = self._name @@ -962,7 +957,6 @@ def _clone(self) -> "Agent": new_agent._sal = self._sal new_agent._advanced_features = self._advanced_features new_agent._parameters = self._parameters - new_agent._interaction_language = self._interaction_language new_agent._instructions = self._instructions new_agent._greeting = self._greeting new_agent._failure_message = self._failure_message diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index 68c7f1d..e5117b0 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -5,7 +5,7 @@ from .base import BaseSTT -InteractionLanguage = Literal[ +TurnDetectionLanguage = Literal[ "ar-EG", "ar-JO", "ar-SA", @@ -40,7 +40,7 @@ "vi-VN", ] -INTERACTION_LANGUAGE_VALUES: Tuple[InteractionLanguage, ...] = ( +TURN_DETECTION_LANGUAGE_VALUES: Tuple[TurnDetectionLanguage, ...] 
= ( "ar-EG", "ar-JO", "ar-SA", @@ -74,14 +74,12 @@ "tr-TR", "vi-VN", ) -_INTERACTION_LANGUAGES = set(INTERACTION_LANGUAGE_VALUES) +_TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} -def _interaction_language(language: Optional[str], interaction_language: Optional[InteractionLanguage]) -> Optional[InteractionLanguage]: - if interaction_language is not None: - return interaction_language - if language in _INTERACTION_LANGUAGES: +def _turn_detection_language(language: Optional[str]) -> Optional[TurnDetectionLanguage]: + if language in _TURN_DETECTION_LANGUAGES: return language # type: ignore[return-value] return None @@ -91,7 +89,6 @@ class SpeechmaticsSTTOptions(BaseModel): api_key: str = Field(..., description="Speechmatics API key") language: str = Field(..., description="Language code (e.g., en, es, fr)") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -115,9 +112,9 @@ def to_config(self) -> Dict[str, Any]: "vendor": "speechmatics", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config @@ -127,7 +124,6 @@ class DeepgramSTTOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="Deepgram API key") model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") language: Optional[str] = Field(default=None, 
description="Language code (e.g., en-US)") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -159,9 +155,9 @@ def to_config(self) -> Dict[str, Any]: "vendor": "deepgram", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config @@ -171,7 +167,6 @@ class MicrosoftSTTOptions(BaseModel): key: str = Field(..., description="Azure subscription key") region: str = Field(..., description="Azure region (e.g., eastus)") language: str = Field(..., description="Language code (e.g., en-US)") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) class MicrosoftSTT(BaseSTT): @@ -191,9 +186,9 @@ def to_config(self) -> Dict[str, Any]: "vendor": "microsoft", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config @@ -205,7 +200,6 @@ class OpenAISTTOptions(BaseModel): language: Optional[str] = Field(default=None, description="Language code") prompt: Optional[str] = 
Field(default=None, description="Prompt that guides OpenAI transcription") input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="OpenAI transcription settings") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) class OpenAISTT(BaseSTT): @@ -229,9 +223,9 @@ def to_config(self) -> Dict[str, Any]: "vendor": "openai", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config @@ -242,7 +236,6 @@ class GoogleSTTOptions(BaseModel): location: str = Field(..., description="Google Cloud region") adc_credentials_string: str = Field(..., description="Google service account credentials JSON string") language: str = Field(..., description="Language code (e.g., en-US)") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Recognition model") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -267,9 +260,9 @@ def to_config(self) -> Dict[str, Any]: "vendor": "google", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config @@ -280,7 +273,6 @@ class AmazonSTTOptions(BaseModel): secret_key: str = 
Field(..., description="AWS Secret Access Key") region: str = Field(..., description="AWS region (e.g., us-east-1)") language: str = Field(..., description="Language code") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AmazonSTT(BaseSTT): @@ -301,9 +293,9 @@ def to_config(self) -> Dict[str, Any]: "vendor": "amazon", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config @@ -312,7 +304,6 @@ class AssemblyAISTTOptions(BaseModel): api_key: str = Field(..., description="AssemblyAI API key") language: str = Field(..., description="Language code") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") uri: Optional[str] = Field(default=None, description="AssemblyAI streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -332,16 +323,16 @@ def to_config(self) -> Dict[str, Any]: "vendor": "assemblyai", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config class AresSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") - language: Optional[InteractionLanguage] = Field(default=None, description="Language code") + language: 
Optional[TurnDetectionLanguage] = Field(default=None, description="Language code") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AresSTT(BaseSTT): @@ -362,7 +353,6 @@ class SarvamSTTOptions(BaseModel): api_key: str = Field(..., description="Sarvam API key") language: str = Field(..., description="Language code (e.g., en, hi, ta)") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -383,7 +373,7 @@ def to_config(self) -> Dict[str, Any]: "vendor": "sarvam", "params": params, } - interaction_language = _interaction_language(self.options.language, self.options.interaction_language) - if interaction_language is not None: - config["language"] = interaction_language + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language return config diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py index 40dbb02..fb58a36 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py @@ -5,6 +5,7 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr_language import AsrLanguage from .start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from .start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness from .start_agents_request_properties_turn_detection_interrupt_mode import ( @@ -18,6 +19,11 @@ class 
StartAgentsRequestPropertiesTurnDetection(UncheckedBaseModel): Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. """ + language: typing.Optional[AsrLanguage] = pydantic.Field(default=None) + """ + BCP-47 language tag identifying the primary language used for agent interaction. + """ + mode: typing.Optional[typing.Literal["default"]] = pydantic.Field(default=None) """ Conversation turn detection mode: diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index 6ebb484..c398e02 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -10,6 +10,7 @@ OpenAI, OpenAISTT, SpeechmaticsSTT, + TurnDetectionConfig, ) @@ -38,46 +39,47 @@ def properties(agent: Agent) -> dict: ) -def test_bcp47_stt_language_sets_asr_language_and_provider_param() -> None: +def test_bcp47_stt_language_sets_turn_detection_language_and_provider_param() -> None: props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en-US"))) assert props["asr"]["vendor"] == "speechmatics" - assert props["asr"]["language"] == "en-US" + assert "language" not in props["asr"] + assert props["turn_detection"]["language"] == "en-US" assert props["asr"]["params"]["language"] == "en-US" -def test_provider_language_defaults_interaction_language_when_not_supported_by_ares() -> None: +def test_provider_language_defaults_turn_detection_language_when_not_supported_by_ares() -> None: props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) assert props["asr"]["vendor"] == "speechmatics" - assert props["asr"]["language"] == "en-US" + assert "language" not in props["asr"] + assert props["turn_detection"]["language"] == "en-US" assert props["asr"]["params"]["language"] == "en" - assert "turn_detection" not in props -def 
test_explicit_interaction_language_can_differ_from_provider_language() -> None: +def test_turn_detection_language_can_differ_from_provider_language() -> None: props = properties( - base_agent() - .with_interaction_language("fr-FR") + Agent(turn_detection=TurnDetectionConfig(language="fr-FR")) + .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini", base_url="https://api.openai.com/v1/chat/completions")) + .with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5", base_url="wss://api.elevenlabs.io/v1")) .with_stt(SpeechmaticsSTT(api_key="stt-key", language="en")) ) - assert props["asr"]["language"] == "fr-FR" + assert props["turn_detection"]["language"] == "fr-FR" + assert "language" not in props["asr"] assert props["asr"]["params"]["language"] == "en" -def test_invalid_explicit_interaction_language_is_rejected() -> None: +def test_invalid_turn_detection_language_is_rejected() -> None: with pytest.raises(ValueError, match="Invalid interaction language: en"): - Agent(interaction_language="en") # type: ignore[arg-type] - - with pytest.raises(ValueError, match="Invalid interaction language: xx-YY"): - base_agent().with_interaction_language("xx-YY") # type: ignore[arg-type] + properties(Agent(turn_detection=TurnDetectionConfig(language="en"))) # type: ignore[arg-type] -def test_default_interaction_language_is_sent_without_stt() -> None: +def test_default_turn_detection_language_is_sent_without_stt() -> None: props = properties(base_agent()) - assert props["asr"]["language"] == "en-US" + assert props["asr"] == {"vendor": "ares"} + assert props["turn_detection"] == {"language": "en-US"} def test_stt_vendor_params_match_documented_shapes() -> None: From 617ee134d9dafbf4f4f83d5e98b80ad110c6e1bf Mon Sep 17 00:00:00 2001 From: "Hermes (agora)" Date: Tue, 2 Jun 2026 15:26:29 -0400 Subject: [PATCH 25/26] feat(agentkit): support agent-level pipeline_id --- README.md | 32 ++++++ docs/reference/agent.md | 10 ++ docs/reference/session.md | 12 +++ 
src/agora_agent/agentkit/agent.py | 9 ++ src/agora_agent/agentkit/agent_session.py | 25 +++-- tests/custom/test_pipeline_id.py | 123 ++++++++++++++++++++++ 6 files changed, 202 insertions(+), 9 deletions(-) create mode 100644 tests/custom/test_pipeline_id.py diff --git a/README.md b/README.md index 269cc61..c8cbabf 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,38 @@ def start_conversation() -> str: `Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. For supported Agora-managed models, leave vendor API keys unset; provide keys when you want BYOK. +## AI Studio pipeline IDs + +Use `pipeline_id` when you want a published AI Studio pipeline to provide the base agent configuration: + +```python +agent = Agent( + name="support", + pipeline_id="studio-pipeline-id", +) + +session = agent.create_session( + client, + channel="support-room", + agent_uid="1", + remote_uids=["100"], +) +``` + +You can override it per session: + +```python +session = agent.create_session( + client, + channel="support-room", + agent_uid="1", + remote_uids=["100"], + pipeline_id="session-pipeline-id", +) +``` + +AgentKit sends the resolved value as the top-level `/join` field `pipeline_id`, not inside `properties`. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, and `advanced_features` may send `properties` fields that override the saved pipeline settings. + ### BYOK version Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed models. 
diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 187229f..86d4fbd 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -27,12 +27,14 @@ Agent( labels: Optional[Dict[str, str]] = None, rtc: Optional[RtcConfig] = None, filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, ) ``` | Parameter | Type | Default | Description | |---|---|---|---| | `name` | `Optional[str]` | `None` | Agent name, used as default session name | +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | @@ -47,6 +49,8 @@ Agent( | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | +`pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. 
## Builder Methods @@ -202,6 +206,8 @@ create_session( token: Optional[str] = None, idle_timeout: Optional[int] = None, enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, expires_in: Optional[int] = None, ) -> AgentSession ``` @@ -219,6 +225,10 @@ Creates an `AgentSession` bound to the given client and channel. | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. 
**Returns:** `AgentSession` diff --git a/docs/reference/session.md b/docs/reference/session.md index 63402f6..76e1367 100644 --- a/docs/reference/session.md +++ b/docs/reference/session.md @@ -33,6 +33,11 @@ AgentSession( token: Optional[str] = None, idle_timeout: Optional[int] = None, enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + debug: Optional[bool] = None, + warn: Optional[Callable[[str], None]] = None, ) ``` @@ -51,6 +56,13 @@ AgentSession( | `token` | `Optional[str]` | No | Pre-built RTC token | | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | +| `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | +| `debug` | `Optional[bool]` | No | Enable debug logging of the start request | +| `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. 
## Methods diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index fea1f0d..0a652db 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -343,8 +343,10 @@ def __init__( rtc: typing.Optional[RtcConfig] = None, filler_words: typing.Optional[FillerWordsConfig] = None, greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, ): self._name = name + self._pipeline_id = pipeline_id self._instructions = instructions self._greeting = greeting self._failure_message = failure_message @@ -609,6 +611,11 @@ def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, Se def name(self) -> typing.Optional[str]: return self._name + @property + def pipeline_id(self) -> typing.Optional[str]: + """Published AI Studio pipeline ID used as this agent's base configuration.""" + return self._pipeline_id + @property def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: return self._llm @@ -693,6 +700,7 @@ def filler_words(self) -> typing.Optional[FillerWordsConfig]: def config(self) -> typing.Dict[str, typing.Any]: return { "name": self._name, + "pipeline_id": self._pipeline_id, "instructions": self._instructions, "greeting": self._greeting, "failure_message": self._failure_message, @@ -945,6 +953,7 @@ def _resolve_turn_detection_config(self) -> TurnDetectionConfig: def _clone(self) -> "Agent": new_agent = Agent.__new__(Agent) new_agent._name = self._name + new_agent._pipeline_id = self._pipeline_id new_agent._llm = self._llm new_agent._tts = self._tts new_agent._stt = self._stt diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index e113dc1..5c866ac 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -52,7 +52,8 @@ class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): Optional fields --------------- - app_certificate, token, 
idle_timeout, enable_string_uid, expires_in + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn """ app_certificate: str @@ -290,14 +291,18 @@ def _is_mllm_mode(self) -> bool: return True return mllm is not None - def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation: bool, + ) -> typing.Dict[str, typing.Any]: base_properties = self._agent.to_properties( channel=self._channel, agent_uid=self._agent_uid, remote_uids=self._remote_uids, idle_timeout=self._idle_timeout, enable_string_uid=self._enable_string_uid, - skip_vendor_validation=True, + skip_vendor_validation=skip_vendor_validation, **token_opts, ) properties = self._dump_model(base_properties) @@ -445,6 +450,7 @@ def start(self) -> str: self._status = "starting" try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id if self._token: token_opts: typing.Dict[str, typing.Any] = {"token": self._token} else: @@ -454,7 +460,7 @@ def start(self) -> str: "expires_in": self._expires_in, } - properties = self._build_start_properties(token_opts) + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) resolved_preset, resolved_properties = resolve_session_presets( self._preset, properties, @@ -466,7 +472,7 @@ def start(self) -> str: "appid": self._app_id, "name": self._name, "preset": resolved_preset, - "pipeline_id": self._pipeline_id, + "pipeline_id": pipeline_id, "properties": resolved_properties, }) @@ -480,7 +486,7 @@ def start(self) -> str: name=self._name, properties=request_properties, preset=resolved_preset, - pipeline_id=self._pipeline_id, + pipeline_id=pipeline_id, request_options=self._request_options(), ) @@ -766,6 +772,7 @@ async def start(self) -> str: self._status = "starting" try: + 
pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id if self._token: token_opts: typing.Dict[str, typing.Any] = {"token": self._token} else: @@ -775,7 +782,7 @@ async def start(self) -> str: "expires_in": self._expires_in, } - properties = self._build_start_properties(token_opts) + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) resolved_preset, resolved_properties = resolve_session_presets( self._preset, properties, @@ -787,7 +794,7 @@ async def start(self) -> str: "appid": self._app_id, "name": self._name, "preset": resolved_preset, - "pipeline_id": self._pipeline_id, + "pipeline_id": pipeline_id, "properties": resolved_properties, }) @@ -801,7 +808,7 @@ async def start(self) -> str: name=self._name, properties=request_properties, preset=resolved_preset, - pipeline_id=self._pipeline_id, + pipeline_id=pipeline_id, request_options=self._request_options(), ) diff --git a/tests/custom/test_pipeline_id.py b/tests/custom/test_pipeline_id.py new file mode 100644 index 0000000..c6c8c8f --- /dev/null +++ b/tests/custom/test_pipeline_id.py @@ -0,0 +1,123 @@ +import pytest + +from agora_agent import Agent + + +def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + +class StartResponse: + agent_id = "agent-id" + + +class FakeAgentsClient: + def __init__(self): + self.calls = [] + + def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + +class FakeAsyncAgentsClient: + def __init__(self): + self.calls = [] + + async def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + +class FakeClient: + app_id = "appid" + app_certificate = None + + def __init__(self, agents): + self.agents = agents + + +def start_agent(agent, **overrides): + agents = 
FakeAgentsClient() + client = FakeClient(agents) + options = { + "channel": "channel", + "token": "token", + "agent_uid": "1", + "remote_uids": ["100"], + **overrides, + } + + agent_id = agent.create_session(client, **options).start() + + assert agent_id == "agent-id" + assert len(agents.calls) == 1 + return agents.calls[0] + + +def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["appid"] == "appid" + assert call["name"] == "support" + assert call["pipeline_id"] == "studio-pipeline-id" + properties = dump(call["properties"]) + assert properties["channel"] == "channel" + assert properties["token"] == "token" + assert properties["agent_rtc_uid"] == "1" + assert properties["remote_rtc_uids"] == ["100"] + + +def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + call = start_agent( + Agent(name="support", pipeline_id="agent-pipeline"), + pipeline_id="session-pipeline", + ) + + assert call["pipeline_id"] == "session-pipeline" + + +def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + + +def test_pipeline_id_is_not_sent_inside_properties() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(call["properties"]) + + +def test_pipeline_id_survives_builder_clone() -> None: + agent = Agent(name="support", pipeline_id="studio-pipeline-id").with_tools(True) + + assert agent.pipeline_id == "studio-pipeline-id" + call = start_agent(agent) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + +@pytest.mark.asyncio +async def test_async_session_uses_agent_pipeline_id() -> None: + agents = 
FakeAsyncAgentsClient() + client = FakeClient(agents) + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + agent_id = await agent.create_async_session( + client, + channel="channel", + token="token", + agent_uid="1", + remote_uids=["100"], + ).start() + + assert agent_id == "agent-id" + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) From 8e22e6d069e77f4c652e15f2f37945538c88c7c4 Mon Sep 17 00:00:00 2001 From: "Hermes (agora)" Date: Tue, 2 Jun 2026 15:36:16 -0400 Subject: [PATCH 26/26] updated agent docs --- docs/reference/agent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 86d4fbd..5693e0b 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -34,7 +34,6 @@ Agent( | Parameter | Type | Default | Description | |---|---|---|---| | `name` | `Optional[str]` | `None` | Agent name, used as default session name | -| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | @@ -48,6 +47,7 @@ Agent( | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | `pipeline_id` is an AI Studio base configuration. 
Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value.