diff --git a/.fern/replay.lock b/.fern/replay.lock index 536b6e4..a435ef4 100644 --- a/.fern/replay.lock +++ b/.fern/replay.lock @@ -6,5 +6,17992 @@ generations: timestamp: 2026-05-20T20:38:02.180Z cli_version: unknown generator_versions: {} -current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf -patches: [] + - commit_sha: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + tree_hash: db7756fbc0a5c6923371615dd752c8e17b2d828b + timestamp: 2026-06-04T20:30:41.901Z + cli_version: unknown + generator_versions: + fernapi/fern-python-sdk: 4.37.0 +current_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 +patches: + - id: patch-6e30398b + content_hash: sha256:e99898e508e2d6cb9f134cc33e0b73c1c8acb845f5887924e0e38031a6e089c0 + original_commit: 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 + original_message: "chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/vendors/__init__.py + patch_content: | + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 1942bce..5ceda66 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -1,13 +1,30 @@ + from .agent import ( + Agent, + + AgentConfig, + + AgentConfigUpdate, + + ConversationHistory, + + ConversationRole, + + ConversationSessionTurn, + + ConversationTurn, + + ConversationTurns, + StartAgentsRequestProperties, + + AvatarConfig, + + AvatarVendor, + GeofenceConfig, + + LlmConfig, + + LlmStyle, + + MllmConfig, + + MllmVendor, + RtcConfig, + + SttConfig, + + SttVendor, + + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + @@ -37,9 +54,14 @@ from .agent 
import ( + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + + SessionInfo, + + SessionListResponse, + + SessionSummary, + + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + @@ -57,8 +79,10 @@ from ..agent_management.types.agent_think_agent_management_request_on_speaking_a + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -112,6 +136,7 @@ from .vendors import ( + FishAudioTTS, + Gemini, + GeminiLive, + + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + @@ -132,14 +157,27 @@ from .vendors import ( + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + + XaiGrok, + + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + + "AgentConfig", + + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + + "LlmConfig", + + "LlmStyle", + + "SttConfig", + + "SttVendor", + + "TtsConfig", + + "MllmConfig", + + "MllmVendor", + + "AvatarConfig", + + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + @@ -147,6 +185,7 @@ __all__ = [ + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + @@ -181,6 +220,7 @@ __all__ = [ + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + @@ -197,6 +237,15 @@ __all__ = [ + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + + "SessionInfo", + + "SessionListResponse", + + 
"SessionSummary", + + "ConversationHistory", + + "ConversationTurn", + + "ConversationRole", + + "ConversationTurns", + + "ConversationSessionTurn", + + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + @@ -253,14 +302,19 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + + "is_generic_avatar", + + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 0320843..689eab1 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -11,9 +11,9 @@ from .base import ( + OpenAISampleRate, + SampleRate, + ) + -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar + +from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + @@ -82,8 +82,11 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + ] + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + 
MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + 
validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + XaiGrok, + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", 
+ "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + 
"OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + 
"MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + status: unresolved + - id: patch-9df782b4 + content_hash: sha256:84c08fe3239d2ecb0b0a3ddd33b0dce4e7b012125be797aa83ca12893363b565 + original_commit: 9df782b46d872599f103078e30c5ded2053f2517 + original_message: "feat(agentkit): update MLLM and LLM vendor wrappers for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/llm.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:57:54 -0400 + Subject: [PATCH] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 + + Adds xAI Grok Realtime and Vertex AI MLLM wrappers, and aligns MLLM + config serialization with the generated core types. LLM vendors now + accept typed greeting_configs and serialize them through the generated + model shape, including interruptable. 
+ --- + src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- + src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ + 2 files changed, 113 insertions(+), 36 deletions(-) + + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 7465c9f..6f74b43 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,9 +1,14 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] + result.append(item) + return result + + + + +def _dump_optional_model(value: Any) -> Any: + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -74,7 +87,7 @@ class OpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: 
Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -150,7 +163,7 @@ class AzureOpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -216,7 +229,7 @@ class Anthropic(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + 
greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -287,7 +300,7 @@ class Gemini(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 5f6f940..cd6cd07 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -53,18 +52,97 @@ class OpenAIRealtime(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if 
self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + + + +class XaiGrokOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="xAI API key") + + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + + + +class XaiGrok(BaseMLLM): + + """xAI 
Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + + + def __init__(self, **kwargs: Any): + + self.options = XaiGrokOptions(**kwargs) + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = dict(self.options.params or {}) + + if self.options.voice is not None: + + params["voice"] = self.options.voice + + if self.options.language is not None: + + params["language"] = self.options.language + + if self.options.sample_rate is not None: + + params["sample_rate"] = self.options.sample_rate + + + + config: Dict[str, Any] = { + + "vendor": "xai", + + "api_key": self.options.api_key, + + "url": self.options.url, + + "params": params, + + } + + + + if self.options.greeting_message is not None: + + config["greeting_message"] = self.options.greeting_message + + if self.options.input_modalities is not None: + + config["input_modalities"] = self.options.input_modalities + + if self.options.output_modalities is not None: + + config["output_modalities"] = self.options.output_modalities + + if self.options.messages is not None: + + config["messages"] = self.options.messages + + if self.options.failure_message is not None: + + config["failure_message"] = self.options.failure_message + + if self.options.turn_detection is not None: + + config["turn_detection"] = self.options.turn_detection + + + + return config + + + + + +class XaiRealtimeOptions(XaiGrokOptions): + + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + + + def __init__(self, **data: Any): + + warnings.warn( + + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**data) + + + + + +class XaiRealtime(XaiGrok): + + """Deprecated: use :class:`XaiGrok` instead.""" + + + + def __init__(self, **kwargs: Any): + + warnings.warn( + + "XaiRealtime is deprecated; use XaiGrok instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**kwargs) + + + + + class VertexAIOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + - params: Dict[str, Any] = { + - "model": self.options.model, + - "project_id": self.options.project_id, + - "location": self.options.location, + - "adc_credentials_string": self.options.adc_credentials_string, + - } + - + + # additional_params spread first so that explicit fields always win, + + # matching the TypeScript SDK. 
+ + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + params["model"] = self.options.model + + params["project_id"] = self.options.project_id + + params["location"] = self.options.location + + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + - if self.options.additional_params is not None: + - params.update(self.options.additional_params) + + config: Dict[str, Any] = { + "vendor": "vertexai", + @@ -119,12 +193,8 @@ class VertexAI(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -179,12 
+247,8 @@ class GeminiLive(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(default="gpt-4o-mini", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + endpoint: str = Field(..., 
description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + max_tokens: Optional[int] = Field(default=None, gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + 
failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or "https://api.anthropic.com/v1/messages", + "api_key": self.options.api_key, + "params": params, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = 
self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + src/agora_agent/agentkit/vendors/mllm.py: 
| + import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, 
description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + + class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: 
str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: 
Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + - id: patch-26706d73 + content_hash: sha256:a9551e0b774b96e7734e9faa7d770611861cf443837428272ef75710447238da + original_commit: 26706d73ae15d860d57daf926837632c01be7f10 + original_message: "feat(agentkit): add GenericAvatar and 
session-aware avatar validation" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/vendors/avatar.py + patch_content: |+ + From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:59:22 -0400 + Subject: [PATCH] feat(agentkit): add GenericAvatar and session-aware avatar + validation + + Adds the GenericAvatar vendor wrapper and extends avatar validation + helpers for generic and RTC-backed avatars. Session-derived fields such + as agora_appid, agora_channel, and agora_token can now be validated + after AgentSession enrichment. + --- + src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- + src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ + 2 files changed, 76 insertions(+), 1 deletion(-) + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index 9e132a9..a04809c 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + return config.get("vendor") == "generic" + + + + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + ) + + + + + +def validate_avatar_config( + + config: typing.Dict[str, typing.Any], + + require_session_fields: bool = False, + +) -> None: + """Validates avatar configuration at runtime. 
+ + Parameters + @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + + if require_session_fields and not params.get("agora_token"): + + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + + elif is_generic_avatar(config): + + params = config.get("params", {}) + + if not params.get("api_key"): + + raise ValueError("Generic avatar requires api_key") + + if not params.get("api_base_url"): + + raise ValueError("Generic avatar requires api_base_url") + + if not params.get("avatar_id"): + + raise ValueError("Generic avatar requires avatar_id") + + if not params.get("agora_uid"): + + raise ValueError("Generic avatar requires agora_uid") + + if require_session_fields: + + if not params.get("agora_token"): + + raise ValueError("Generic avatar requires agora_token after session enrichment") + + if not params.get("agora_appid"): + + raise ValueError("Generic avatar requires agora_appid after session enrichment") + + if not params.get("agora_channel"): + + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index b83a356..00cad8f 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -132,6 +132,48 @@ class LiveAvatarAvatar(BaseAvatar): + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + +class GenericAvatarOptions(BaseModel): + + 
model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Generic avatar provider API key") + + api_base_url: str = Field(..., description="Avatar provider API base URL") + + avatar_id: str = Field(..., description="Avatar ID") + + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + +class GenericAvatar(BaseAvatar): + + def __init__(self, **kwargs: Any): + + self.options = GenericAvatarOptions(**kwargs) + + + + @property + + def required_sample_rate(self) -> int: + + return 0 + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = { + + "api_key": self.options.api_key, + + "api_base_url": self.options.api_base_url, + + "avatar_id": self.options.avatar_id, + + "agora_uid": self.options.agora_uid, + + } + + + + if self.options.agora_appid is not None: + + params["agora_appid"] = self.options.agora_appid + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + + if self.options.agora_channel is not None: + + params["agora_channel"] = self.options.agora_channel + + if self.options.additional_params is not None: + + params = {**self.options.additional_params, **params} + + + + enable = self.options.enable if self.options.enable is not None else True + + return {"enable": enable, "vendor": "generic", "params": params} + + + + + class AnamAvatarOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/avatar_types.py: | + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. + """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. 
" + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. 
+ """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + AKOOL_SAMPLE_RATE = 16000 + + + class HeyGenAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="HeyGen API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, 
description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific 
parameters") + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + pass + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = 
Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + status: unresolved + - id: patch-9f491c63 + content_hash: sha256:d9811b2c5927be74f2125444dcf36642b88ad7be422019688cb0228093dce1d0 + original_commit: 9f491c63a964c13c67ba4af3708379e1b75a92d8 + original_message: "feat(agentkit): update Agent builder and session lifecycle for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + patch_content: |+ + From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 21:00:58 -0400 + Subject: [PATCH] feat(agentkit): update Agent builder and session lifecycle + for v2.7 + + Aligns Agent and AgentSession with the generated v2.7 request shape. + MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars + are rejected when MLLM is configured. AgentSession now enriches generic + and RTC avatars with session context, auto-generates avatar tokens, + validates TTS sample rates from vendor-specific fields, and adds + paginated get_turns/get_all_turns helpers with fail-fast pagination + guards. 
+ --- + src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- + src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- + 2 files changed, 360 insertions(+), 35 deletions(-) + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 70a1bdd..86a958e 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -8,6 +8,24 @@ if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + +from ..agents.types.get_agents_response import GetAgentsResponse + +from ..agents.types.list_agents_response import ListAgentsResponse + +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + +from 
..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger import + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + +LlmConfig = StartAgentsRequestPropertiesLlm + +LlmStyle = StartAgentsRequestPropertiesLlmStyle + +SttConfig = StartAgentsRequestPropertiesAsr + +SttVendor = StartAgentsRequestPropertiesAsrVendor + +TtsConfig = Tts + +MllmConfig = StartAgentsRequestPropertiesMllm + +MllmVendor = 
StartAgentsRequestPropertiesMllmVendor + +AvatarConfig = StartAgentsRequestPropertiesAvatar + +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + @@ -93,6 +122,18 @@ InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + +AgentConfig = StartAgentsRequestProperties + +AgentConfigUpdate = UpdateAgentsRequestProperties + +SessionInfo = GetAgentsResponse + +SessionListResponse = ListAgentsResponse + +SessionSummary = ListAgentsResponseDataListItem + +ConversationHistory = GetHistoryAgentsResponse + +ConversationTurn = GetHistoryAgentsResponseContentsItem + +ConversationRole = GetHistoryAgentsResponseContentsItemRole + +ConversationTurns = GetTurnsAgentsResponse + +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + +SpeakPriority = SpeakAgentsRequestPriority + +Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + @@ -116,6 +157,7 @@ FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + @@ -183,9 +225,20 @@ class Agent: + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + + sample_rate = vendor.sample_rate + + if ( + + self._avatar_required_sample_rate not in (None, 0) + + and sample_rate is not None + 
+ and sample_rate != self._avatar_required_sample_rate + + ): + + raise ValueError( + + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + + f"but TTS is configured with {sample_rate} Hz. " + + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + - new_agent._tts_sample_rate = vendor.sample_rate + + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + @@ -194,6 +247,9 @@ class Agent: + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + + # Note: avatars are not supported with MLLM. The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` so callers can still + + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + @@ -202,7 +258,10 @@ class Agent: + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + + advanced_features_model = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_mllm": None}, + + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + @@ -214,6 +273,10 @@ class Agent: + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + + # Note: avatars are not supported with MLLM. 
The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + + # enabled) so callers may still combine the two for testing or for the + + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + @@ -282,7 +345,10 @@ class Agent: + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + + new_agent._advanced_features = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_tools": enabled}, + + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + @@ -294,6 +360,23 @@ class Agent: + new_agent._parameters = parameters + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + + new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. 
+ + @@ -342,6 +425,33 @@ class Agent: + new_agent._filler_words = filler_words + return new_agent + + + @staticmethod + + def _field_value(value: typing.Any, field: str) -> typing.Any: + + if value is None: + + return None + + if isinstance(value, dict): + + return value.get(field) + + return getattr(value, field, None) + + + + @staticmethod + + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + + if hasattr(value, "model_copy"): + + return value.model_copy(update=update) + + if hasattr(value, "copy"): + + return value.copy(update=update) + + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + + data_channel = self._field_value(self._parameters, "data_channel") + + if not enable_rtm or data_channel is not None: + + return self._parameters + + if self._parameters is None: + + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + + if isinstance(self._parameters, dict): + + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + + @property + def name(self) -> typing.Optional[str]: + return self._name + @@ -354,6 +464,10 @@ class Agent: + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + + @property + + def tts_sample_rate(self) -> typing.Optional[int]: + + return self._tts_sample_rate + + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + @@ -536,6 +650,20 @@ class Agent: + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + + # Validate the MLLM + enabled-avatar combination BEFORE generating the + + # RTC token so callers 
get a clear, actionable error first (matches the + + # TypeScript and Go SDKs' fail-fast contract). + + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + + avatar_enabled = ( + + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + + ) + + if is_mllm_mode and avatar_enabled: + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + @@ -553,9 +681,6 @@ class Agent: + **token_kwargs, + ) + + - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + - is_mllm_mode = bool(mllm_flag or self._mllm is not None) + - + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + @@ -579,11 +704,12 @@ class Agent: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + - if self._parameters is not None: + - if isinstance(self._parameters, dict): + - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + + parameters = self._resolved_parameters() + + if parameters is not None: + + if isinstance(parameters, dict): + + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + - base_kwargs["parameters"] = self._parameters + + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + @@ -596,12 +722,10 @@ class Agent: + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + - if self._greeting: + + if self._greeting is not None: + 
mllm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + - if self._max_history is not None: + - mllm_config.setdefault("max_history", self._max_history) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + @@ -617,14 +741,14 @@ class Agent: + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + - if self._instructions: + + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + - if self._greeting: + - llm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + - llm_config.setdefault("failure_message", self._failure_message) + + if self._greeting is not None: + + llm_config["greeting_message"] = self._greeting + + if self._failure_message is not None: + + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + - llm_config.setdefault("max_history", self._max_history) + + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index 2408659..e41a399 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_request_on_thinking_a + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent 
import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -182,17 +185,29 @@ class _AgentSessionBase: + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + + if self._is_mllm_mode(): + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + + sample_rate = self._agent.tts_sample_rate + + if sample_rate is None and isinstance(tts_params, dict): + + sample_rate = ( + + tts_params.get("sample_rate") + + or tts_params.get("sample_rate_hertz") + + or tts_params.get("samplingRate") + + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + @@ -211,6 +226,50 @@ class _AgentSessionBase: + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + + avatar = properties.get("avatar") + + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + + return + + + + params = avatar.get("params") + + if not isinstance(params, dict): + + params = {} + + avatar["params"] = params + + + + if is_generic_avatar(avatar): + + if not params.get("agora_appid"): + + params["agora_appid"] = self._app_id + + if not params.get("agora_channel"): + + params["agora_channel"] = self._channel + + + + if not is_rtc_avatar(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_token"): + + if not self._app_certificate: + + raise ValueError( + + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + + ) + + token_kwargs: typing.Dict[str, typing.Any] = {} + + if self._expires_in is not None: + + token_kwargs["token_expire"] = self._expires_in + + params["agora_token"] = generate_convo_ai_token( + + app_id=self._app_id, + + app_certificate=self._app_certificate, + + channel_name=self._channel, + + account=str(params["agora_uid"]), + + **token_kwargs, + + ) + + + + if str(params.get("agora_uid")) == self._agent_uid: + + self._warn( + + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ + ) + + + + validate_avatar_config(avatar, require_session_fields=True) + + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + @@ -238,12 +297,17 @@ class _AgentSessionBase: + **token_opts, + ) + properties = self._dump_model(base_properties) + + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + - mllm = dict(self._agent.mllm) + - if self._agent.greeting: + + mllm = self._dump_model(self._agent.mllm) + + if not isinstance(mllm, dict): + + mllm = {} + + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + + if self._agent.failure_message is not None: + + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + @@ -251,20 +315,41 @@ class _AgentSessionBase: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + - if self._agent.instructions: + + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + - if self._agent.greeting: + - llm.setdefault("greeting_message", self._agent.greeting) + - if self._agent.failure_message: + - llm.setdefault("failure_message", self._agent.failure_message) + + if self._agent.greeting is not None: + + llm["greeting_message"] = self._agent.greeting + + if self._agent.failure_message is not None: + + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + - llm.setdefault("max_history", self._agent.max_history) + + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + + @staticmethod + + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + + if pagination is None: + + return None + + if 
isinstance(pagination, dict): + + return pagination.get(field) + + return getattr(pagination, field, None) + + + + @staticmethod + + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + + return list(turns or []) + + + + @classmethod + + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + + data = cls._dump_model(first_response) + + if not isinstance(data, dict): + + data = {} + + data["turns"] = turns + + return GetTurnsAgentsResponse(**data) + + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + @@ -484,7 +569,12 @@ class AgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -547,15 +637,68 @@ class AgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - def get_turns(self) -> typing.Any: + + def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." + + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + @@ -734,7 +877,12 @@ class AsyncAgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -797,11 +945,64 @@ class AsyncAgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - async def get_turns(self) -> typing.Any: + + async def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = await self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = await self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from 
..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from 
..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, 
BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and 
.end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + 
silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and 
sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + 
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + 
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history 
is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + status: unresolved + - id: patch-eaec58eb + content_hash: sha256:8390ced175326080fc76021a97d315e71229bbc9ad70eef35a63eb9968df7830 + original_commit: eaec58eb2edfe03b1311a32dd137a867edf5d096 + original_message: "refactor(agentkit): align deprecated vendor aliases with canonical names" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/__init__.py + - src/agora_agent/agentkit/vendors/avatar.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 689eab1..8e2042e 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -13,7 +13,7 @@ from .base import ( + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + @@ -83,7 +83,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index 00cad8f..50bdd08 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -5,19 +5,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + -HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + -class HeyGenAvatarOptions(BaseModel): + +class 
LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + - api_key: str = Field(..., description="HeyGen API key") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + @@ -31,20 +31,14 @@ class HeyGenAvatarOptions(BaseModel): + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + -class HeyGenAvatar(BaseAvatar): + - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - warnings.warn( + - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - self.options = HeyGenAvatarOptions(**kwargs) + + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return HEYGEN_SAMPLE_RATE + + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + @@ -65,71 +59,79 @@ class HeyGenAvatar(BaseAvatar): + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "heygen", "params": params} + + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + -class AkoolAvatarOptions(BaseModel): + - model_config = ConfigDict(extra="forbid") + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + - api_key: str = Field(..., description="Akool API key") + - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + -class AkoolAvatar(BaseAvatar): + +class HeyGenAvatar(BaseAvatar): + + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + + def __init__(self, **kwargs: Any): + - self.options = AkoolAvatarOptions(**kwargs) + + warnings.warn( + + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return AKOOL_SAMPLE_RATE + + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + + "quality": self.options.quality, + + "agora_uid": self.options.agora_uid, + } + + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + + if self.options.disable_idle_timeout is not None: + + params["disable_idle_timeout"] = self.options.disable_idle_timeout + + if self.options.activity_idle_timeout is not None: + + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "akool", "params": params} + + return {"enable": enable, "vendor": "heygen", "params": params} + + + -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + - pass + +class AkoolAvatarOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Akool API key") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + -class LiveAvatarAvatar(BaseAvatar): + +class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - 
self.options = LiveAvatarAvatarOptions(**kwargs) + + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return LIVEAVATAR_SAMPLE_RATE + + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + - "quality": self.options.quality, + - "agora_uid": self.options.agora_uid, + } + + - if self.options.agora_token is not None: + - params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + - if self.options.disable_idle_timeout is not None: + - params["disable_idle_timeout"] = self.options.disable_idle_timeout + - if self.options.activity_idle_timeout is not None: + - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "liveavatar", "params": params} + + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + - persona_id: Optional[str] = Field(default=None, description="Persona ID") + + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index cd6cd07..b58f040 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,4 +1,3 @@ + -import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -119,30 +118,6 @@ class XaiGrok(BaseMLLM): + return config + + + -class XaiRealtimeOptions(XaiGrokOptions): + - """Deprecated: use :class:`XaiGrokOptions` instead.""" + - + - def __init__(self, **data: Any): + - warnings.warn( + - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**data) + - + - + -class XaiRealtime(XaiGrok): + - """Deprecated: use :class:`XaiGrok` instead.""" + - + - def __init__(self, **kwargs: Any): + - warnings.warn( + - "XaiRealtime is deprecated; use XaiGrok instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**kwargs) + - + - + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + theirs_snapshot: + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + 
MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + LIVEAVATAR_SAMPLE_RATE = 24000 + HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + class LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] 
= Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = 
self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = 
{**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + src/agora_agent/agentkit/vendors/mllm.py: | + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: 
Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). 
Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class 
VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + 
- id: patch-20245632 + content_hash: sha256:a22e4a3b114ba8105c8129ccd6222570dc1f231daf9ac6037a00bcd4e11c425b + original_commit: 20245632afd066efe5a453665b29c5ba0e13e4f8 + original_message: "feat(agentkit): export type aliases and avatar token helpers" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/constants.py + patch_content: |+ + From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 15:17:27 -0400 + Subject: [PATCH] feat(agentkit): export type aliases and avatar token helpers + + --- + src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- + src/agora_agent/agentkit/agent.py | 22 +++++++++- + src/agora_agent/agentkit/agent_session.py | 8 +++- + src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- + src/agora_agent/agentkit/constants.py | 10 +++++ + 5 files changed, 90 insertions(+), 22 deletions(-) + + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 5ceda66..e9ab221 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -2,6 +2,7 @@ from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + @@ -62,23 +63,23 @@ from .agent import ( + SessionListResponse, + SessionSummary, + SpeakPriority, + + ThinkOnListeningAction, + + ThinkOnSpeakingAction, + + ThinkOnThinkingAction, + + ThinkResponse, + ) + -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + -from ..agent_management.types.agent_think_agent_management_response import ( + - AgentThinkAgentManagementResponse as AgentThinkResponse, + -) + -from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + +# Deprecated think type aliases (prefer ThinkOn* names). + +from .agent import ( + + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + + ThinkResponse as AgentThinkResponse, + ) + +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + @@ -94,6 +95,13 @@ from .constants import ( + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + + ThinkOnListeningActionIgnore, + + ThinkOnListeningActionInject, + + ThinkOnListeningActionInterrupt, + + ThinkOnSpeakingActionIgnore, + + ThinkOnSpeakingActionInterrupt, + + ThinkOnThinkingActionIgnore, + + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + @@ -158,7 +166,6 @@ from .vendors import ( + SpeechmaticsSTT, + VertexAI, + XaiGrok, + - XaiRealtime, + LiveAvatarAvatar, + ) + + @@ -172,6 +179,7 @@ __all__ = [ + "LlmConfig", + "LlmStyle", + "SttConfig", + + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + @@ -230,6 +238,13 @@ __all__ = [ + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + + "ThinkOnListeningActionInject", + + 
"ThinkOnListeningActionInterrupt", + + "ThinkOnListeningActionIgnore", + + "ThinkOnThinkingActionInterrupt", + + "ThinkOnThinkingActionIgnore", + + "ThinkOnSpeakingActionInterrupt", + + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + @@ -246,10 +261,16 @@ __all__ = [ + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + + "ThinkResponse", + + "ThinkOnListeningAction", + + "ThinkOnThinkingAction", + + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + + "is_avatar_token_managed", + + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + @@ -303,7 +324,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + @@ -314,7 +334,6 @@ __all__ = [ + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + - "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 86a958e..14933a2 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + -from .token import generate_convo_ai_token, _validate_expires_in + +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + + AgentThinkAgentManagementRequestOnListeningAction, + +) + 
+from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + + AgentThinkAgentManagementRequestOnThinkingAction, + +) + +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + + AgentThinkAgentManagementRequestOnSpeakingAction, + +) + +from ..agent_management.types.agent_think_agent_management_response import ( + + AgentThinkAgentManagementResponse, + +) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + +AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + @@ -159,6 +171,14 @@ FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + +# Think type aliases and response + +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + +ThinkResponse = AgentThinkAgentManagementResponse + + + +from .token import generate_convo_ai_token, _validate_expires_in + + + + class Agent: + """A reusable agent definition. 
+ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e41a399..269619e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -20,10 +20,10 @@ from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + - is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -242,7 +242,11 @@ class _AgentSessionBase: + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + - if not is_rtc_avatar(avatar): + + if not is_avatar_token_managed(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index a04809c..aea9da1 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -1,3 +1,4 @@ + +import warnings + import typing + + + @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + + """Return True when AgentKit manages the avatar RTC publisher identity.""" + + return ( + + is_heygen_avatar(config) + + or is_live_avatar_avatar(config) + + or is_generic_avatar(config) + + ) + + + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + - params = config.get("params", {}) + - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + """Deprecated: use :func:`is_avatar_token_managed` for 
vendor gating.""" + + warnings.warn( + + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + + "and keep agora_uid checks in session enrichment.", + + DeprecationWarning, + + stacklevel=2, + ) + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + @@ -95,7 +110,7 @@ def validate_tts_sample_rate( + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - - HeyGen: ONLY supports 24,000 Hz + + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py + index f86e4d3..c0a852e 100644 + --- a/src/agora_agent/agentkit/constants.py + +++ b/src/agora_agent/agentkit/constants.py + @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + + + +# Think action value constants (match Fern wire values) + +ThinkOnListeningActionInject = "inject" + +ThinkOnListeningActionInterrupt = "interrupt" + +ThinkOnListeningActionIgnore = "ignore" + +ThinkOnThinkingActionInterrupt = "interrupt" + +ThinkOnThinkingActionIgnore = "ignore" + +ThinkOnSpeakingActionInterrupt = "interrupt" + +ThinkOnSpeakingActionIgnore = "ignore" + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + 
FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, + ) + # Deprecated think type aliases (prefer ThinkOn* names). 
+ from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + 
VertexAI, + XaiGrok, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + 
"AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, 
AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from 
..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import 
StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from 
..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from 
..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + 
ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _validate_expires_in + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. 
+ + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features 
+ self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + 
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + 
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." 
+ ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." + ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return 
properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + src/agora_agent/agentkit/avatar_types.py: | + import warnings + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. 
+ """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + 
avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. + """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/constants.py: | + """ + Type-safe constants for agent configuration values. + Use these instead of raw strings to avoid typos and get IDE autocomplete. 
+ """ + + # Data channel: "rtm" | "datastream" + class DataChannel: + RTM = "rtm" + DATASTREAM = "datastream" + + class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + + + # Silence action when timeout elapses: "speak" | "think" + # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) + class SilenceActionValues: + SPEAK = "speak" + THINK = "think" + + + # SAL mode: "locking" | "recognition" + # (Use for sal.sal_mode — avoids shadowing SalMode type) + class SalModeValues: + LOCKING = "locking" + RECOGNITION = "recognition" + + + # Geofence area: "GLOBAL" | "NORTH_AMERICA" | "EUROPE" | "ASIA" | "INDIA" | "JAPAN" + class GeofenceArea: + GLOBAL = "GLOBAL" + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Geofence exclude area (when area is GLOBAL) + class GeofenceExcludeArea: + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Filler word selection rule: "shuffle" | "round_robin" + class FillerWordsSelectionRule: + SHUFFLE = "shuffle" + ROUND_ROBIN = "round_robin" + + + # Turn detection type (deprecated; use TurnDetectionNestedConfig.EndOfSpeech instead) + class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + # Think action value constants (match Fern wire values) + ThinkOnListeningActionInject = "inject" + ThinkOnListeningActionInterrupt = "interrupt" + ThinkOnListeningActionIgnore = "ignore" + ThinkOnThinkingActionInterrupt = "interrupt" + ThinkOnThinkingActionIgnore = "ignore" + ThinkOnSpeakingActionInterrupt = "interrupt" + ThinkOnSpeakingActionIgnore = "ignore" + status: unresolved + - id: patch-972dd5bd + content_hash: sha256:10f86db20e0b5a3800efce4913b736ff338dee29eb18cb31e89658e0293b848e + original_commit: 972dd5bdafc09b3981ab2ce4e0d02beae165c626 + original_message: updated docs + original_author: 
digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 16:13:35 -0400 + Subject: [PATCH] updated docs + + --- + docs/reference/agent.md | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 1e88b8b..3163f9c 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -264,3 +264,18 @@ to_properties( + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + + +## Type aliases + + + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + + + +## Cross-SDK discovery map + + + +| Concept | Python | TypeScript | Go | + +|---|---|---|---| + +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | + | `failure_message` | `Optional[str]` | `None` | Spoken on error | + | `max_history` | `Optional[int]` | `None` | Max conversation history length | + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | 
`filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent.agentkit.vendors import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent.agentkit.vendors import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent.agentkit.vendors import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent.agentkit.vendors import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. 
+ + + ```python + from agora_agent.agentkit.vendors import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Override the system prompt. + + ### `with_greeting(greeting: str) -> Agent` + + Override the greeting message. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. 
+ + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Set the message spoken via TTS when the LLM call fails. + + ### `with_max_history(max_history: int) -> Agent` + + Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | System prompt | + | `greeting` | `Optional[str]` | Greeting message | + | `failure_message` | `Optional[str]` | Message spoken when LLM fails | + | `max_history` | `Optional[int]` | Max conversation history length | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ + ## Cross-SDK discovery map + + | Concept | Python | TypeScript | Go | + |---|---|---|---| + | STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + | xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + | Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + | Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + status: unresolved + - id: patch-7465fada + content_hash: sha256:9c6ed2e5f48702293eed8b213cc31cce63a7ed5a1ad16a0b23e791c13e77746f + original_commit: 7465fadafa0f1e62051d99b42d0eeda85f31eeee + original_message: "fix(agentkit): resolve Python session typing issues" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - src/agora_agent/agentkit/agent_session.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index dbff562..dca9ee8 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action 
import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import ( + get_preset_category, + infer_asr_preset, + infer_llm_preset, + infer_tts_preset, + normalize_preset_input, + resolve_session_presets, + ) + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. 
+ + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. 
+ + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + 
LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation_categories: typing.AbstractSet[str], + allow_missing_vendor_categories: typing.AbstractSet[str], + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation_categories=skip_vendor_validation_categories, + allow_missing_vendor_categories=allow_missing_vendor_categories, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": 
self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + def _vendor_validation_categories( + self, + pipeline_id: typing.Optional[str], + ) -> typing.Tuple[typing.Set[str], typing.Set[str]]: + skip_categories: typing.Set[str] = set() + allow_missing_categories: typing.Set[str] = {"asr", "llm", "tts"} if pipeline_id else set() + + preset = normalize_preset_input(self._preset) + if preset: + for item in preset.split(","): + category = get_preset_category(item) + if category is not None: + skip_categories.add(category) + allow_missing_categories.add(category) + + if infer_asr_preset(self._agent.stt): + skip_categories.add("asr") + if infer_llm_preset(self._agent.llm): + skip_categories.add("llm") + if infer_tts_preset(self._agent.tts): + skip_categories.add("tts") + return skip_categories, allow_missing_categories + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + 
@classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. + """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + skip_categories, allow_missing_categories = self._vendor_validation_categories(pipeline_id) + properties = self._build_start_properties( + token_opts, + skip_vendor_validation_categories=skip_categories, + allow_missing_vendor_categories=allow_missing_categories, + ) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + 
"pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. 
+ priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + skip_categories, allow_missing_categories = self._vendor_validation_categories(pipeline_id) + properties = self._build_start_properties( + token_opts, + skip_vendor_validation_categories=skip_categories, + allow_missing_vendor_categories=allow_missing_categories, + ) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": 
resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. 
+ priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + - id: patch-d29165c4 + content_hash: sha256:be59d1d3efc435d5e0b83305b2cd39ce3dad4534a4125de18028c137e692e659 + original_commit: d29165c4ddd8296af703a4e9ed848516f563dd1b + original_message: make python compat package publishable + original_author: chenyuguo + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/pyproject.toml + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 + From: chenyuguo + Date: Wed, 27 May 2026 17:24:50 +0800 + Subject: [PATCH] make python compat package publishable + + --- + compat/agora-agent-server-sdk/README.md | 2 ++ + compat/agora-agent-server-sdk/pyproject.toml | 3 +++ + .../src/agora_agent_server_sdk_compat/__init__.py | 1 + + 3 files changed, 6 insertions(+) + create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index 1388836..cff3cfe 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -9,3 +9,5 @@ pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index 8efbe53..ac93128 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -26,6 +26,9 @@ classifiers = [ + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + +packages = [ + + { include = "agora_agent_server_sdk_compat", from = "src"} + +] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + new file mode 100644 + index 0000000..55522c6 + --- /dev/null + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -0,0 +1 @@ + +"""Compatibility package for the renamed agora-agents distribution.""" + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.0.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.0.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility package for the renamed agora-agents distribution.""" + status: unresolved + - id: patch-fae1249a + content_hash: sha256:01bf21f3cc4c784dfcff80a48c9c7bb3123af4327a567b7c990b528e9780e9a2 + original_commit: fae1249a20c53761a2eb5515a1bf92ca666760d1 + original_message: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
+ original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 27 May 2026 16:58:18 -0400 + Subject: [PATCH] Re-export agora-agents API from legacy PyPI compatibility + package The compat distribution delegates to agora_agent via __getattr__ and + documents both import paths in its README. + + --- + compat/agora-agent-server-sdk/README.md | 7 +++++-- + .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- + 2 files changed, 18 insertions(+), 3 deletions(-) + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index cff3cfe..e43d1d8 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -8,6 +8,9 @@ New projects should install: + pip install agora-agents + ``` + + -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ +```python + +from agora_agent import Agora, Area + +from agora_agent_server_sdk_compat import Agora, Area + +``` + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + index 55522c6..6283244 100644 + --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -1 +1,13 @@ + -"""Compatibility package for the renamed agora-agents distribution.""" + +"""Compatibility re-exports for the renamed agora-agents package.""" + + + +import agora_agent as _agora_agent + + + +__all__ = getattr(_agora_agent, "__all__", []) + + + + + +def __getattr__(name: str): + + return getattr(_agora_agent, name) + + + + + +def __dir__(): + + return dir(_agora_agent) + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. 
The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility re-exports for the renamed agora-agents package.""" + + import agora_agent as _agora_agent + + __all__ = getattr(_agora_agent, "__all__", []) + + + def __getattr__(name: str): + return getattr(_agora_agent, name) + + + def __dir__(): + return dir(_agora_agent) + user_owned: true + - id: patch-fc9d93c3 + content_hash: sha256:93877741bdad745fda5dd549d7c3dd6bc315f4574aabd2defb52c0c795bff011 + original_commit: fc9d93c3026a6109d8a5e8b386418592f8d121c5 + original_message: Document agora-agents PyPI install name and migration notes + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/installation.md + patch_content: | + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index c14bdb2..f6f1750 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. + ## Install with pip + + ```sh + -pip install agora-agent-sdk + +pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + -poetry add agora-agent-sdk + +poetry add agora-agents + ``` + + ## Dependencies + theirs_snapshot: + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. 
+ --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Dependencies + + The following packages are installed automatically: + + | Package | Purpose | + |---|---| + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Sync vs. Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. + status: unresolved + - id: patch-44c21c14 + content_hash: sha256:920a8a5905a3bbb134edb28b007c5c0b1b4b2c1f75753140fef305b14a64e3e0 + original_commit: 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b + original_message: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_root_exports.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index a820291..f84862c 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -231,8 +231,7 @@ class Agent: + + Examples + -------- + - >>> from agora_agent.agentkit import Agent + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index fb8e548..a749d1e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -412,12 +412,10 @@ class AgentSession(_AgentSessionBase): + + Examples + -------- + - >>> from agora_agent import Agora, Area + - >>> from agora_agent.agentkit import Agent + + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + @@ -735,12 +733,10 @@ class AsyncAgentSession(_AgentSessionBase): + + Examples + -------- + - >>> from agora_agent import AsyncAgora, Area + - >>> from agora_agent.agentkit import Agent + + >>> from 
agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py + new file mode 100644 + index 0000000..9b2f508 + --- /dev/null + +++ b/tests/custom/test_root_exports.py + @@ -0,0 +1,29 @@ + +import pytest + + + +import agora_agent + +import agora_agent.agentkit as agentkit + + + + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + + + +def test_root_exports_fern_client_symbols() -> None: + + assert agora_agent.Agora is not None + + assert agora_agent.Area is not None + + assert agora_agent.AsyncAgora is not None + + + + + +def test_unknown_root_export_raises_attribute_error() -> None: + + with pytest.raises(AttributeError): + + _ = agora_agent.NotARealExportName + + + + + +def test_dir_includes_agentkit_vendor_exports() -> None: + + assert "DeepgramSTT" in dir(agora_agent) + + + + + +def test_all_includes_agentkit_vendor_exports() -> None: + + assert "DeepgramSTT" in agora_agent.__all__ + + assert "OpenAI" in agora_agent.__all__ + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from 
..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection 
import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from 
..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from 
..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from 
..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + 
EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = 
GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = 
StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = 
vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config 
+ return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. + """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. 
+ """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. + """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. 
+ """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. + """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> 
typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": 
self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, 
+ enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). 
+ mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = 
self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. 
+ if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as 
AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. 
+ + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # 
------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." 
+ ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." + ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + 
properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: 
typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. + """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + 
request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_root_exports.py: | + import pytest + + import agora_agent + import agora_agent.agentkit as agentkit + + + def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + + def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + + def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + + def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ + status: unresolved + - id: patch-d475306b + content_hash: sha256:407af5e7564d6e8d0b91f1e117cb433aec931f083225af53c6df2abfff281b22 + original_commit: d475306bd42279984bcf4934b900003e8e02c4eb + original_message: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - docs/getting-started/installation.md + patch_content: | + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index e43d1d8..1da36aa 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index 04b48da..8fca9ab 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + -See [Authentication](./authentication.md) for setup details. + +## Next steps + + + +- [Authentication](./authentication.md) — configure your credentials + +- [Quick Start](./quick-start.md) — build your first conversational agent + + + +## Migrating from a previous package name + + + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). 
+ theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. + --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Imports + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI + ``` + + The package installs as `agora-agents` and imports as `agora_agent`. + + ## Sync vs. 
Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + ## Dependencies + + | Package | Purpose | + | ------------------------------ | ------------------------------------------------------ | + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Next steps + + - [Authentication](./authentication.md) — configure your credentials + - [Quick Start](./quick-start.md) — build your first conversational agent + + ## Migrating from a previous package name + + The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). + status: unresolved + - id: patch-c9355576 + content_hash: sha256:83b3b6148b21f2b4d53ee67321777522f5f4e871b61ea3b23f3a6b88ca052769 + original_commit: c93555763ffd63267a737b3e430217a890f203db + original_message: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/authentication.md + - docs/guides/low-level-api.md + patch_content: | + diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md + index 31dcc56..74c62cd 100644 + --- a/docs/getting-started/authentication.md + +++ b/docs/getting-started/authentication.md + @@ -46,41 +46,6 @@ session = agent.create_session( + print(client.auth_mode) # "app-credentials" + ``` + + -## Other auth modes + +## Legacy auth modes + + -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. + - + -### Token auth (`auth_token`) + - + -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - auth_token="your-rest-auth-token", + -) + - + -session = agent.create_session( + - client, + - channel="room-123", + - agent_uid="1", + - remote_uids=["100"], + - token="your-rtc-join-token", + -) + -``` + - + -### Basic Auth (`customer_id` + `customer_secret`) + - + -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - customer_id="your-customer-id", + - customer_secret="your-customer-secret", + -) + -``` + +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. 
+ diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md + index 6677b45..47397b7 100644 + --- a/docs/guides/low-level-api.md + +++ b/docs/guides/low-level-api.md + @@ -1,187 +1,55 @@ + --- + sidebar_position: 10 + title: Low-Level API + -description: Direct client.agents.start() usage without the builder pattern. + +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. + +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + -## Raw telephony and phone-number APIs + - + -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + - + -- `client.telephony` for call status and hangup operations + -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
+ + -## Cascading flow (ASR → LLM → TTS) + +## Client setup + + ```python + from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import Tts_Elevenlabs + + client = Agora( + area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + -client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + + app_id="your-app-id", + + app_certificate="your-app-certificate", + ) + ``` + + -## Async (low-level) + +## Raw telephony and phone-number APIs + + -```python + -import asyncio + -from agora_agent import Area, AsyncAgora + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import 
Tts_Elevenlabs + +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + -client = AsyncAgora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + +- `client.telephony` for call status and hangup operations + +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + + +```python + +calls = client.telephony.list( + + appid=client.app_id, + + type="sip", + ) + + -async def main() -> None: + - await client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + - ) + - + -asyncio.run(main()) + +for call in calls: + + print(call.id, call.state) + ``` + + -## MLLM flow (multimodal) + +## Direct agent APIs + + -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). + +`client.agents` exposes the generated REST surface for advanced integrations. 
Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + -```python + -from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesMllm, + - StartAgentsRequestPropertiesMllmVendor, + - StartAgentsRequestPropertiesTts, + - StartAgentsRequestPropertiesTtsVendor, + - StartAgentsRequestPropertiesLlm, + -) + +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + -client = Agora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + - + -client.agents.start( + - client.app_id, + - name="mllm_agent", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="your_token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - mllm=StartAgentsRequestPropertiesMllm( + - enable=True, + - url="wss://api.openai.com/v1/realtime", + - api_key="", + - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + - params={ + - "model": "gpt-4o-realtime-preview", + - "voice": "alloy", + - }, + - input_modalities=["audio"], + - output_modalities=["text", "audio"], + - greeting_message="Hello! I'm ready to chat in real-time.", + - turn_detection={ + - "mode": "server_vad", + - "server_vad_config": { + - "idle_timeout_ms": 5000, + - }, + - }, + - ), + - ), + +```python + +info = session.raw.get( + + appid=session.app_id, + + agent_id=session.id, + ) + ``` + + -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). + +You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ theirs_snapshot: + docs/getting-started/authentication.md: | + --- + sidebar_position: 2 + title: Authentication + description: Configure the Python SDK with app credentials and understand other supported auth modes. + --- + + # Authentication + + Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. + + ## App credentials + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + + agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) + ) + + session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + ) + ``` + + ## Why app credentials + + - Fresh short-lived tokens per API call instead of reusing long-lived credentials + - No Customer ID / Customer Secret in request headers + - No manual REST or RTC token provisioning in application code + + ## Inspecting auth mode + + ```python + print(client.auth_mode) # "app-credentials" + ``` + + ## Legacy auth modes + + The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. + docs/guides/low-level-api.md: | + --- + sidebar_position: 10 + title: Low-Level API + description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + Use the `Agent` builder and `AgentSession` for conversational agent starts. 
That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. + + ## Client setup + + ```python + from agora_agent import Agora, Area + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + ``` + + ## Raw telephony and phone-number APIs + + AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + - `client.telephony` for call status and hangup operations + - `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + ```python + calls = client.telephony.list( + appid=client.app_id, + type="sip", + ) + + for call in calls: + print(call.id, call.state) + ``` + + ## Direct agent APIs + + `client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + ```python + info = session.raw.get( + appid=session.app_id, + agent_id=session.id, + ) + ``` + + You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ status: unresolved + - id: patch-299e4bd9 + content_hash: sha256:e1470176436d28416d0ff67d8acc614060fae7b312f86c09b899a92d1c4adfe4 + original_commit: 299e4bd9cb59bd6144084332a7c3fa7bf260769f + original_message: "fix(agentkit): resolve provider config type checks" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/vendors/llm.py + - src/agora_agent/agentkit/vendors/mllm.py + - src/agora_agent/agentkit/vendors/stt.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 6275f04..ecf01c6 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -57,6 +57,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + @@ -536,6 +538,23 @@ class Agent: + ) + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + 
+ new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 9156a01..5dd822d 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,7 +1,10 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field, model_validator + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + LlmGreetingConfigs = Dict[str, Any] + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 236a494..6a260d8 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py + index e5117b0..bb222a9 100644 + --- a/src/agora_agent/agentkit/vendors/stt.py + +++ b/src/agora_agent/agentkit/vendors/stt.py + @@ -89,6 +89,7 @@ class SpeechmaticsSTTOptions(BaseModel): + + api_key: str = Field(..., description="Speechmatics API key") + language: str = Field(..., description="Language code (e.g., en, es, fr)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora 
interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + @@ -124,6 +125,7 @@ class DeepgramSTTOptions(BaseModel): + api_key: Optional[str] = Field(default=None, description="Deepgram API key") + model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") + punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + @@ -353,6 +355,7 @@ class SarvamSTTOptions(BaseModel): + + api_key: str = Field(..., description="Sarvam API key") + language: str = Field(..., description="Language code (e.g., en, hi, ta)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + import warnings + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import 
StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from 
..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from 
..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule 
+ from ..types.tts import Tts + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + from ..types.mllm import Mllm + from ..types.mllm_turn_detection import MllmTurnDetection + from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode + from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = Llm + LlmStyle = GeneratedLlmStyle + SttConfig = Asr + AsrConfig = SttConfig + SttVendor = typing.Any + TtsConfig = Tts + MllmConfig = Mllm + MllmVendor = GeneratedMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = 
StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = MllmTurnDetection + MllmTurnDetectionMode = GeneratedMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: 
AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = typing.Dict[str, typing.Any] + LlmGreetingConfigsMode = typing.Any + McpServersItem = typing.Dict[str, typing.Any] + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + TurnDetectionLanguage = typing_extensions.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", 
+ "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" + TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + def _is_turn_detection_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _TURN_DETECTION_LANGUAGES + + + def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: + if not _is_turn_detection_language(value): + raise ValueError(f"Invalid interaction language: {value}") + return value # type: ignore[return-value] + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Deprecated: + The Agent-level ``instructions``, ``greeting``, ``failure_message``, + ``max_history``, and ``greeting_configs`` convenience fields are kept + for compatibility. Configure those values on the LLM or MLLM vendor + instead. 
+ + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + 
self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + """Deprecated. Configure system messages on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + """Deprecated. Configure the greeting on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Deprecated. Configure greeting playback on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = 
StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Deprecated. Configure max history on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def pipeline_id(self) -> typing.Optional[str]: + """Published AI Studio pipeline ID used as this agent's base configuration.""" + return self._pipeline_id + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, 
typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": 
self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], 
None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + skip_vendor_validation_categories: typing.Optional[typing.AbstractSet[str]] = None, + allow_missing_vendor_categories: typing.Optional[typing.AbstractSet[str]] = None, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). 
+ mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = 
self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + warnings.warn( + "skip_vendor_validation is deprecated and will be removed in a future release. 
" + "Use skip_vendor_validation_categories and allow_missing_vendor_categories instead.", + DeprecationWarning, + stacklevel=2, + ) + + skip_categories = set(skip_vendor_validation_categories or ()) + allow_missing_categories = set(allow_missing_vendor_categories or ()) + if skip_vendor_validation: + skip_categories.update({"asr", "llm", "tts"}) + allow_missing_categories.update({"asr", "llm", "tts"}) + + skip_asr_validation = skip_vendor_validation or "asr" in skip_categories + skip_llm_validation = skip_vendor_validation or "llm" in skip_categories + skip_tts_validation = skip_vendor_validation or "tts" in skip_categories + allow_missing_asr = "asr" in allow_missing_categories + allow_missing_llm = "llm" in allow_missing_categories + allow_missing_tts = "tts" in allow_missing_categories + + if not skip_asr_validation and (self._stt is not None or not allow_missing_asr): + base_kwargs["asr"] = self._resolve_asr_config() + base_kwargs["turn_detection"] = self._resolve_turn_detection_config() + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None and not (skip_tts_validation or allow_missing_tts): + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None and not (skip_llm_validation or allow_missing_llm): + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + if self._llm is not None and not skip_llm_validation: + base_kwargs["llm"] = self._resolve_llm_config() + if self._tts is not None and not skip_tts_validation: + base_kwargs["tts"] = self._tts + + return StartAgentsRequestProperties(**base_kwargs) + + def _resolve_llm_config(self) -> typing.Dict[str, typing.Any]: + llm_config = dict(self._llm or {}) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. 
+ if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + return llm_config + + def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + asr_config = dict(self._stt or {}) + asr_config.pop("language", None) + if not asr_config: + asr_config["vendor"] = "ares" + return asr_config + + def _resolve_turn_detection_config(self) -> TurnDetectionConfig: + existing_stt_language = self._stt.get("language") if self._stt is not None else None + existing_turn_detection_language = self._field_value(self._turn_detection, "language") + language = ( + existing_turn_detection_language + if existing_turn_detection_language is not None + else existing_stt_language + if _is_turn_detection_language(existing_stt_language) + else DEFAULT_TURN_DETECTION_LANGUAGE + ) + language = _validate_turn_detection_language(language) + if self._turn_detection is None: + return StartAgentsRequestPropertiesTurnDetection(language=language) + if isinstance(self._turn_detection, dict): + return typing.cast(TurnDetectionConfig, {**self._turn_detection, "language": language}) + return self._copy_model_update(self._turn_detection, {"language": language}) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._pipeline_id = self._pipeline_id + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = 
self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field, model_validator + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Dict[str, Any] + _OPENAI_MANAGED_MODELS = {"gpt-4o-mini", "gpt-4.1-mini", "gpt-5-nano", "gpt-5-mini"} + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(..., description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + @model_validator(mode="after") + def _validate_byok_params(self) -> "OpenAIOptions": + if not self.model: + raise ValueError("OpenAI requires model") + if self.api_key is not None and self.base_url is None: + raise ValueError("OpenAI requires base_url when api_key is set") + if self.api_key is None 
and self.base_url is not None: + raise ValueError("OpenAI base_url is only valid when api_key is set") + if self.api_key is None and self.model.strip().lower() not in _OPENAI_MANAGED_MODELS: + raise ValueError("OpenAI requires api_key unless using a supported Agora-managed model") + if self.api_key is None and self.vendor is not None: + raise ValueError("OpenAI Agora-managed mode does not allow vendor") + return self + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + 
config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + model: str = Field(..., description="Azure deployment model name") + endpoint: str = Field(..., description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(..., description="Model name") + url: str = Field(..., description="Anthropic messages endpoint URL") + max_tokens: int = Field(..., gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Dict[str, str] = Field(..., description="Anthropic request headers") + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url, + "api_key": self.options.api_key, + "params": params, + "headers": self.options.headers, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: 
Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GroqOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Groq API key") + model: str = Field(..., description="Model name") + base_url: str = 
Field(..., description="Groq-compatible endpoint") + + + class Groq(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = GroqOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["url"] = self.options.base_url + return config + + + class CustomLLMOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Custom LLM API key") + base_url: str = Field(..., description="OpenAI-compatible chat completions endpoint") + + + class CustomLLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = CustomLLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["vendor"] = self.options.vendor or "custom" + return config + + + class VertexAILLMOptions(GeminiOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Vertex AI access token or API key") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location") + + + class VertexAILLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAILLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + options = _dump_optional_model(self.options) + options.pop("project_id", None) + options.pop("location", None) + config = Gemini(**options).to_config() + params = dict(config["params"]) + params["project_id"] = self.options.project_id + params["location"] = self.options.location + config["params"] = params + return config + + + class AmazonBedrockOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + access_key: str = Field(..., description="AWS access key ID") + secret_key: str = Field(..., description="AWS secret access key") + region: str = Field(..., description="AWS region") + model: str = Field(..., description="Amazon Bedrock model identifier") + 
max_tokens: Optional[int] = Field(default=None, gt=0) + url: Optional[str] = Field(default=None, description="Amazon Bedrock converse stream endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + + class AmazonBedrock(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AmazonBedrockOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or f"https://bedrock-runtime.{self.options.region}.amazonaws.com/model/{self.options.model}/converse-stream", + "access_key": self.options.access_key, + "secret_key": self.options.secret_key, + "region": self.options.region, + "model": self.options.model, + "params": params, + "style": "bedrock", + "input_modalities": self.options.input_modalities or ["text"], + 
} + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config + + + class DifyOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Dify API key") + url: str = Field(..., description="Dify workflow or chat endpoint") + model: str = Field(..., description="Dify model identifier") + user: Optional[str] = Field(default=None, description="Dify user identifier") + conversation_id: Optional[str] = Field(default=None, description="Dify conversation ID") + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = 
Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0) + + + class Dify(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = DifyOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.user is not None: + params["user"] = self.options.user + if self.options.conversation_id is not None: + params["conversation_id"] = self.options.conversation_id + + config: Dict[str, Any] = { + "url": self.options.url, + "api_key": self.options.api_key, + "params": params, + "style": "dify", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config + src/agora_agent/agentkit/vendors/mllm.py: | + import 
warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...types.mllm_turn_detection import MllmTurnDetection + from .base import BaseMLLM + + MllmTurnDetectionConfig = MllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + voice: Optional[str] = Field(default=None, description="Voice identifier") + instructions: Optional[str] = Field(default=None, description="System instructions") + input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="Audio transcription settings") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if ( + self.options.model is not None + or self.options.params is not None + or self.options.voice is not None + 
or self.options.instructions is not None + or self.options.input_audio_transcription is not None + ): + params: Dict[str, Any] = {} + if self.options.model is not None: + params["model"] = self.options.model + if self.options.params is not None: + params.update(self.options.params) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.input_audio_transcription is not None: + params["input_audio_transcription"] = self.options.input_audio_transcription + config["params"] = params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. 
+ + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if 
self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, 
description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options + + config: Dict[str, Any] = { + "vendor": "vertexai", + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + 
config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + 
class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + src/agora_agent/agentkit/vendors/stt.py: | + from typing import Any, Dict, Optional, Tuple + + from pydantic import BaseModel, ConfigDict, Field, 
model_validator + from typing_extensions import Literal + + from .base import BaseSTT + + TurnDetectionLanguage = Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + TURN_DETECTION_LANGUAGE_VALUES: Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} + + + def _turn_detection_language(language: Optional[str]) -> Optional[TurnDetectionLanguage]: + if language in _TURN_DETECTION_LANGUAGES: + return language # type: ignore[return-value] + return None + + + class SpeechmaticsSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Speechmatics API key") + language: str = Field(..., description="Language code (e.g., en, es, fr)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class SpeechmaticsSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = SpeechmaticsSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + 
params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "api_key": self.options.api_key, + "language": self.options.language, + }) + if self.options.model is not None: + params["model"] = self.options.model + if self.options.uri is not None: + params["uri"] = self.options.uri + + config: Dict[str, Any] = { + "vendor": "speechmatics", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class DeepgramSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="Deepgram API key") + model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") + punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + @model_validator(mode="after") + def _validate_managed_model(self) -> "DeepgramSTTOptions": + if self.api_key is None and (self.model is None or self.model.strip().lower() not in _DEEPGRAM_MANAGED_MODELS): + raise ValueError("DeepgramSTT requires api_key unless using a supported Agora-managed model") + return self + + class DeepgramSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = DeepgramSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + if self.options.api_key is not None: + params["key"] = self.options.api_key + if self.options.model is not None: + params["model"] = 
self.options.model + if self.options.language is not None: + params["language"] = self.options.language + if self.options.smart_format is not None: + params["smart_format"] = self.options.smart_format + if self.options.punctuation is not None: + params["punctuation"] = self.options.punctuation + config: Dict[str, Any] = { + "vendor": "deepgram", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class MicrosoftSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + key: str = Field(..., description="Azure subscription key") + region: str = Field(..., description="Azure region (e.g., eastus)") + language: str = Field(..., description="Language code (e.g., en-US)") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class MicrosoftSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = MicrosoftSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "key": self.options.key, + "region": self.options.region, + }) + if self.options.language is not None: + params["language"] = self.options.language + + config: Dict[str, Any] = { + "vendor": "microsoft", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class OpenAISTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model (default: whisper-1)") + language: Optional[str] = Field(default=None, description="Language code") + prompt: Optional[str] = Field(default=None, description="Prompt that guides OpenAI transcription") + 
input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="OpenAI transcription settings") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class OpenAISTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = OpenAISTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + + transcription = {"model": "whisper-1", **(self.options.input_audio_transcription or {})} + if self.options.model is not None: + transcription["model"] = self.options.model + if self.options.prompt is not None: + transcription["prompt"] = self.options.prompt + if self.options.language is not None: + transcription["language"] = self.options.language + params["input_audio_transcription"] = transcription + + config: Dict[str, Any] = { + "vendor": "openai", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class GoogleSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud region") + adc_credentials_string: str = Field(..., description="Google service account credentials JSON string") + language: str = Field(..., description="Language code (e.g., en-US)") + model: Optional[str] = Field(default=None, description="Recognition model") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class GoogleSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = GoogleSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "project_id": self.options.project_id, + "location": self.options.location, + 
"adc_credentials_string": self.options.adc_credentials_string, + }) + + if self.options.language is not None: + params["language"] = self.options.language + if self.options.model is not None: + params["model"] = self.options.model + + config: Dict[str, Any] = { + "vendor": "google", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AmazonSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + access_key: str = Field(..., description="AWS Access Key ID") + secret_key: str = Field(..., description="AWS Secret Access Key") + region: str = Field(..., description="AWS region (e.g., us-east-1)") + language: str = Field(..., description="Language code") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class AmazonSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AmazonSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "access_key_id": self.options.access_key, + "secret_access_key": self.options.secret_key, + "region": self.options.region, + }) + if self.options.language is not None: + params["language_code"] = self.options.language + + config: Dict[str, Any] = { + "vendor": "amazon", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AssemblyAISTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="AssemblyAI API key") + language: str = Field(..., description="Language code") + uri: Optional[str] = Field(default=None, description="AssemblyAI streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + 
+ class AssemblyAISTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AssemblyAISTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + if self.options.language is not None: + params["language"] = self.options.language + if self.options.uri is not None: + params["uri"] = self.options.uri + + config: Dict[str, Any] = { + "vendor": "assemblyai", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AresSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + language: Optional[TurnDetectionLanguage] = Field(default=None, description="Language code") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class AresSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AresSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = {"vendor": "ares"} + if self.options.language is not None: + config["language"] = self.options.language + if self.options.additional_params: + config["params"] = self.options.additional_params + return config + + + class SarvamSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Sarvam API key") + language: str = Field(..., description="Language code (e.g., en, hi, ta)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class SarvamSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = SarvamSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] 
= dict(self.options.additional_params or {}) + params.update({ + "api_key": self.options.api_key, + "language": self.options.language, + }) + if self.options.model is not None: + params["model"] = self.options.model + + config: Dict[str, Any] = { + "vendor": "sarvam", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + - id: patch-617ee134 + content_hash: sha256:ea2d27ba8019bf09ce5766d322eb7218fcee0a90124e823ba16c4e45dc1af5a9 + original_commit: 617ee134d9dafbf4f4f83d5e98b80ad110c6e1bf + original_message: "feat(agentkit): support agent-level pipeline_id" + original_author: Hermes (agora) + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_pipeline_id.py + patch_content: | + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 187229f..86d4fbd 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -27,12 +27,14 @@ Agent( + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. 
| + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + @@ -47,6 +49,8 @@ Agent( + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + +`pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + @@ -202,6 +206,8 @@ create_session( + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + + preset: Optional[Union[str, Sequence[str]]] = None, + + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + @@ -219,6 +225,10 @@ Creates an `AgentSession` bound to the given client and channel. + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. 
| + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + diff --git a/docs/reference/session.md b/docs/reference/session.md + index 63402f6..76e1367 100644 + --- a/docs/reference/session.md + +++ b/docs/reference/session.md + @@ -33,6 +33,11 @@ AgentSession( + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + + preset: Optional[Union[str, Sequence[str]]] = None, + + pipeline_id: Optional[str] = None, + + expires_in: Optional[int] = None, + + debug: Optional[bool] = None, + + warn: Optional[Callable[[str], None]] = None, + ) + ``` + + @@ -51,6 +56,13 @@ AgentSession( + | `token` | `Optional[str]` | No | Pre-built RTC token | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + +| `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | + +| `debug` | `Optional[bool]` | No | Enable debug logging of the start request | + +| `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + + + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. 
+ + ## Methods + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index fea1f0d..0a652db 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -343,8 +343,10 @@ class Agent: + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + @@ -609,6 +611,11 @@ class Agent: + def name(self) -> typing.Optional[str]: + return self._name + + + @property + + def pipeline_id(self) -> typing.Optional[str]: + + """Published AI Studio pipeline ID used as this agent's base configuration.""" + + return self._pipeline_id + + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + @@ -693,6 +700,7 @@ class Agent: + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + @@ -945,6 +953,7 @@ class Agent: + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + + new_agent._pipeline_id = self._pipeline_id + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e113dc1..5c866ac 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -52,7 +52,8 @@ class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + + Optional fields + --------------- + - app_certificate, token, idle_timeout, enable_string_uid, expires_in + + app_certificate, 
token, idle_timeout, enable_string_uid, preset, + + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + @@ -290,14 +291,18 @@ class _AgentSessionBase: + return True + return mllm is not None + + - def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + + def _build_start_properties( + + self, + + token_opts: typing.Dict[str, typing.Any], + + skip_vendor_validation: bool, + + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + - skip_vendor_validation=True, + + skip_vendor_validation=skip_vendor_validation, + **token_opts, + ) + properties = self._dump_model(base_properties) + @@ -445,6 +450,7 @@ class AgentSession(_AgentSessionBase): + self._status = "starting" + + try: + + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + @@ -454,7 +460,7 @@ class AgentSession(_AgentSessionBase): + "expires_in": self._expires_in, + } + + - properties = self._build_start_properties(token_opts) + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + @@ -466,7 +472,7 @@ class AgentSession(_AgentSessionBase): + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + - "pipeline_id": self._pipeline_id, + + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + @@ -480,7 +486,7 @@ class AgentSession(_AgentSessionBase): + name=self._name, + properties=request_properties, + preset=resolved_preset, + - pipeline_id=self._pipeline_id, + + pipeline_id=pipeline_id, + 
request_options=self._request_options(), + ) + + @@ -766,6 +772,7 @@ class AsyncAgentSession(_AgentSessionBase): + self._status = "starting" + + try: + + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + @@ -775,7 +782,7 @@ class AsyncAgentSession(_AgentSessionBase): + "expires_in": self._expires_in, + } + + - properties = self._build_start_properties(token_opts) + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + @@ -787,7 +794,7 @@ class AsyncAgentSession(_AgentSessionBase): + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + - "pipeline_id": self._pipeline_id, + + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + @@ -801,7 +808,7 @@ class AsyncAgentSession(_AgentSessionBase): + name=self._name, + properties=request_properties, + preset=resolved_preset, + - pipeline_id=self._pipeline_id, + + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + diff --git a/tests/custom/test_pipeline_id.py b/tests/custom/test_pipeline_id.py + new file mode 100644 + index 0000000..c6c8c8f + --- /dev/null + +++ b/tests/custom/test_pipeline_id.py + @@ -0,0 +1,123 @@ + +import pytest + + + +from agora_agent import Agent + + + + + +def dump(value): + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + + + +class StartResponse: + + agent_id = "agent-id" + + + + + +class FakeAgentsClient: + + def __init__(self): + + self.calls = [] + + + + def start(self, appid, **kwargs): + + self.calls.append({"appid": appid, **kwargs}) + + return StartResponse() + + + + + +class FakeAsyncAgentsClient: + + def 
__init__(self): + + self.calls = [] + + + + async def start(self, appid, **kwargs): + + self.calls.append({"appid": appid, **kwargs}) + + return StartResponse() + + + + + +class FakeClient: + + app_id = "appid" + + app_certificate = None + + + + def __init__(self, agents): + + self.agents = agents + + + + + +def start_agent(agent, **overrides): + + agents = FakeAgentsClient() + + client = FakeClient(agents) + + options = { + + "channel": "channel", + + "token": "token", + + "agent_uid": "1", + + "remote_uids": ["100"], + + **overrides, + + } + + + + agent_id = agent.create_session(client, **options).start() + + + + assert agent_id == "agent-id" + + assert len(agents.calls) == 1 + + return agents.calls[0] + + + + + +def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["appid"] == "appid" + + assert call["name"] == "support" + + assert call["pipeline_id"] == "studio-pipeline-id" + + properties = dump(call["properties"]) + + assert properties["channel"] == "channel" + + assert properties["token"] == "token" + + assert properties["agent_rtc_uid"] == "1" + + assert properties["remote_rtc_uids"] == ["100"] + + + + + +def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + + call = start_agent( + + Agent(name="support", pipeline_id="agent-pipeline"), + + pipeline_id="session-pipeline", + + ) + + + + assert call["pipeline_id"] == "session-pipeline" + + + + + +def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + + + + +def test_pipeline_id_is_not_sent_inside_properties() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + assert "pipeline_id" not in dump(call["properties"]) + + + + + +def 
test_pipeline_id_survives_builder_clone() -> None: + + agent = Agent(name="support", pipeline_id="studio-pipeline-id").with_tools(True) + + + + assert agent.pipeline_id == "studio-pipeline-id" + + call = start_agent(agent) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + + + + +@pytest.mark.asyncio + +async def test_async_session_uses_agent_pipeline_id() -> None: + + agents = FakeAsyncAgentsClient() + + client = FakeClient(agents) + + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + + + agent_id = await agent.create_async_session( + + client, + + channel="channel", + + token="token", + + agent_uid="1", + + remote_uids=["100"], + + ).start() + + + + assert agent_id == "agent-id" + + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | + | `failure_message` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | + | `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. 
| + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. 
Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. + + + ```python + from agora_agent import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Deprecated. 
Configure `system_messages` on the LLM vendor instead. + + ### `with_greeting(greeting: str) -> Agent` + + Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. + + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. + + ### `with_max_history(max_history: int) -> Agent` + + Deprecated. Configure `max_history` on the LLM vendor instead. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). 
+ + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. 
+ + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. + + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | + | `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | + | `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | + | `max_history` | `Optional[int]` | Deprecated Agent-level max history | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, 
`AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + docs/reference/session.md: | + --- + sidebar_position: 3 + title: AgentSession + description: Full API reference for the Python AgentSession class. + --- + + # AgentSession / AsyncAgentSession Reference + + **Import:** + + ```python + from agora_agent import AgentSession + from agora_agent import AsyncAgentSession + # or from top-level: + from agora_agent import AgentSession, AsyncAgentSession + ``` + + ## Constructor + + Sessions are normally created via `Agent.create_session()`. Direct construction is available for advanced use: + + + ```python + AgentSession( + client: Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: List[str], + app_certificate: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + debug: Optional[bool] = None, + warn: Optional[Callable[[str], None]] = None, + ) + ``` + + `AsyncAgentSession` has the same constructor signature. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `agent` | `Agent` | Yes | Agent configuration | + | `app_id` | `str` | Yes | Agora App ID | + | `name` | `str` | Yes | Session name | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `app_certificate` | `Optional[str]` | No | App Certificate (for auto token generation) | + | `token` | `Optional[str]` | No | Pre-built RTC token | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + | `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | + | `debug` | `Optional[bool]` | No | Enable debug logging of the start request | + | `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. + + ## Methods + + ### `start()` + + Start the agent session. Generates an RTC token if not provided, validates avatar/TTS config for cascading sessions, and calls the Agora API. MLLM sessions do not require TTS; an enabled avatar is rejected when MLLM is configured (a disabled avatar is allowed). 
+ + | | Sync (`AgentSession`) | Async (`AsyncAgentSession`) | + |---|---|---| + | **Signature** | `start() -> str` | `async start() -> str` | + | **Returns** | Agent ID | Agent ID | + | **Raises** | `RuntimeError` if not in `idle`, `stopped`, or `error` state | Same | + | **Raises** | `ValueError` if avatar/TTS sample rate mismatch or an enabled avatar is used with MLLM | Same | + + + ```python + # Sync + agent_id = session.start() + + # Async + agent_id = await session.start() + ``` + + ### `stop()` + + Stop the agent session. If the agent has already stopped (404 from API), transitions to `stopped` without raising. + + | | Sync | Async | + |---|---|---| + | **Signature** | `stop() -> None` | `async stop() -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + # Sync + session.stop() + + # Async + await session.stop() + ``` + + ### `say(text, priority=None, interruptable=None)` + + Send text to be spoken by the agent's TTS. + + | | Sync | Async | + |---|---|---| + | **Signature** | `say(text: str, priority: Optional[str] = None, interruptable: Optional[bool] = None) -> None` | Same with `async` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `text` | `str` | Yes | Text to speak | + | `priority` | `str` | No | `INTERRUPT`, `APPEND`, or `IGNORE` | + | `interruptable` | `bool` | No | Whether the message can be interrupted | + + + ```python + # Sync + session.say('Hello!', priority='INTERRUPT', interruptable=False) + + # Async + await session.say('Hello!', priority='INTERRUPT', interruptable=False) + ``` + + ### `interrupt()` + + Interrupt the agent while speaking or thinking. 
+ + | | Sync | Async | + |---|---|---| + | **Signature** | `interrupt() -> None` | `async interrupt() -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + # Sync + session.interrupt() + + # Async + await session.interrupt() + ``` + + ### `update(properties)` + + Update the agent configuration at runtime. + + | | Sync | Async | + |---|---|---| + | **Signature** | `update(properties: Any) -> None` | `async update(properties: Any) -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + from agora_agent.agents.types import UpdateAgentsRequestProperties + + # Sync + session.update(properties) + + # Async + await session.update(properties) + ``` + + ### `think(text, ...)` + + Inject a custom text instruction into the running agent. + + In API v2.7, omitting `on_listening_action` uses the server default `interrupt`. Pass `on_listening_action='inject'` explicitly to preserve the pre-v2.7 behavior. + + ```python + session.think('Summarize the last answer', on_listening_action='inject') + ``` + + ### `get_history()` + + Retrieve the conversation history. + + | | Sync | Async | + |---|---|---| + | **Signature** | `get_history() -> Any` | `async get_history() -> Any` | + | **Raises** | `RuntimeError` if no agent ID | Same | + + + ```python + # Sync + history = session.get_history() + + # Async + history = await session.get_history() + ``` + + ### `get_info()` + + Retrieve the current session info. + + | | Sync | Async | + |---|---|---| + | **Signature** | `get_info() -> Any` | `async get_info() -> Any` | + | **Raises** | `RuntimeError` if no agent ID | Same | + + + ```python + # Sync + info = session.get_info() + + # Async + info = await session.get_info() + ``` + + ### `get_turns(page_index=None, page_size=None)` + + Retrieve paginated turn analytics for a completed or running session. In v2.7, the API defaults to page 1 and up to 50 turns per page. 
Responses include `agent_id`, `name`, `channel`, `total_turn_count`, `pagination`, and `turns`. + + ```python + page = session.get_turns(page_index=1, page_size=50) + ``` + + ### `get_all_turns(page_size=None)` + + Fetch all turn pages and return a single `GetTurnsAgentsResponse` with the combined `turns` list. + + ```python + all_turns = session.get_all_turns(page_size=50) + ``` + + ### `on(event, handler)` + + Register an event handler. This method is synchronous on both `AgentSession` and `AsyncAgentSession`. + + + ```python + session.on('started', lambda data: print(f'Started: {data}')) + ``` + + | Parameter | Type | Description | + |---|---|---| + | `event` | `str` | Event type: `started`, `stopped`, or `error` | + | `handler` | `Callable[..., None]` | Callback function | + + ### `off(event, handler)` + + Remove a previously registered event handler. + + + ```python + session.off('started', my_handler) + ``` + + ## Properties + + | Property | Type | Description | + |---|---|---| + | `id` | `Optional[str]` | Agent ID (set after `start()`) | + | `status` | `str` | Current state: `idle`, `starting`, `running`, `stopping`, `stopped`, `error` | + | `agent` | `Agent` | The agent configuration | + | `app_id` | `str` | Agora App ID | + | `raw` | `AgentsClient` / `AsyncAgentsClient` | Direct access to Fern-generated agents client | + + ## State Transitions + + | Current State | Allowed Actions | + |---|---| + | `idle` | `start()` | + | `starting` | (waiting for API) | + | `running` | `stop()`, `say()`, `interrupt()`, `update()`, `get_history()`, `get_info()` | + | `stopping` | (waiting for API) | + | `stopped` | `start()` (restart) | + | `error` | `start()` (retry) | + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import 
StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from 
..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config 
import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + from ..types.mllm import Mllm + from ..types.mllm_turn_detection import MllmTurnDetection + from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode + from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = Llm + LlmStyle = GeneratedLlmStyle + SttConfig = Asr + AsrConfig = SttConfig + SttVendor = typing.Any + TtsConfig = Tts + MllmConfig = Mllm + MllmVendor = GeneratedMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + 
TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = MllmTurnDetection + MllmTurnDetectionMode = GeneratedMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: 
AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = typing.Dict[str, typing.Any] + LlmGreetingConfigsMode = typing.Any + McpServersItem = typing.Dict[str, typing.Any] + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + TurnDetectionLanguage = typing_extensions.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", 
+ "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" + TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + def _is_turn_detection_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _TURN_DETECTION_LANGUAGES + + + def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: + if not _is_turn_detection_language(value): + raise ValueError(f"Invalid interaction language: {value}") + return value # type: ignore[return-value] + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Deprecated: + The Agent-level ``instructions``, ``greeting``, ``failure_message``, + ``max_history``, and ``greeting_configs`` convenience fields are kept + for compatibility. Configure those values on the LLM or MLLM vendor + instead. 
+ + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + 
def with_tts(self, vendor: BaseTTS) -> "Agent":
    """Return a copy of this agent configured with the given TTS vendor.

    If an avatar was attached earlier and recorded a non-zero required sample
    rate, a TTS vendor that declares a different sample rate is rejected. A
    vendor whose ``sample_rate`` is ``None`` is accepted without the check.

    Raises:
        ValueError: if the vendor's sample rate conflicts with the rate the
            avatar requires.
    """
    tts_rate = vendor.sample_rate
    avatar_rate = self._avatar_required_sample_rate
    # ``None`` and ``0`` both mean "the avatar imposes no sample-rate constraint".
    avatar_pins_rate = avatar_rate not in (None, 0)
    if avatar_pins_rate and tts_rate is not None and tts_rate != avatar_rate:
        raise ValueError(
            f"Avatar requires TTS sample rate of {avatar_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {avatar_rate}."
        )

    configured = self._clone()
    configured._tts = vendor.to_config()
    # Stored so a later with_avatar() call can run the mirror-image check.
    configured._tts_sample_rate = tts_rate
    return configured
def with_avatar(self, vendor: BaseAvatar) -> "Agent":
    """Return a copy of this agent configured with the given avatar vendor.

    If a TTS vendor was attached earlier and recorded a sample rate, an avatar
    that requires a different non-zero rate is rejected.

    Raises:
        ValueError: if the avatar's required sample rate conflicts with the
            sample rate of the TTS already configured.
    """
    # Avatars cannot run together with MLLM, but that pairing is deliberately
    # not rejected here: the check lives in ``to_properties`` /
    # ``AgentSession.start`` and only fires for an enabled avatar, so the two
    # can still be combined for tests or the disabled-avatar pattern.
    avatar_rate = vendor.required_sample_rate
    tts_rate = self._tts_sample_rate
    # ``None`` and ``0`` both mean "this avatar imposes no sample-rate constraint".
    avatar_pins_rate = avatar_rate not in (None, 0)
    if avatar_pins_rate and tts_rate is not None and tts_rate != avatar_rate:
        raise ValueError(
            f"Avatar requires TTS sample rate of {avatar_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {avatar_rate}."
        )

    configured = self._clone()
    configured._avatar = vendor.to_config()
    # Stored so a later with_tts() call can run the mirror-image check.
    configured._avatar_required_sample_rate = avatar_rate
    return configured
def with_tools(self, enabled: bool = True) -> "Agent":
    """Return a copy of this agent with ``enable_tools`` set in its advanced features.

    Existing advanced-feature settings are preserved; only ``enable_tools`` is
    added or overwritten. The container type is kept: a dict stays a dict and
    a model object is copied with the field updated.
    """
    updated = self._clone()
    current = updated._advanced_features
    if current is None:
        # Nothing configured yet: start a fresh advanced-features model.
        merged = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled)
    elif isinstance(current, dict):
        # Build a new dict so the one held by the source agent is not mutated.
        merged = typing.cast(AdvancedFeatures, {**current, "enable_tools": enabled})
    else:
        merged = self._copy_model_update(current, {"enable_tools": enabled})
    updated._advanced_features = merged
    return updated
Configure the failure message on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Deprecated. Configure max history on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def pipeline_id(self) -> typing.Optional[str]: + """Published AI Studio pipeline ID used as this agent's base configuration.""" + return self._pipeline_id + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, 
typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": 
def create_session(
    self,
    client: typing.Any,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    name: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> "AgentSession":
    """Create a synchronous ``AgentSession`` bound to this agent definition.

    The session name is the explicit ``name`` if truthy, otherwise the agent's
    own name, otherwise ``agent-<unix seconds>``. ``app_id`` and
    ``app_certificate`` are read from ``client`` when it has those attributes
    and default to ``""`` and ``None`` respectively. Every other argument is
    forwarded to ``AgentSession`` unchanged.
    """
    # Imported here rather than at module top; the module only imports
    # agent_session under TYPE_CHECKING.
    from .agent_session import AgentSession

    resolved_name = name or self._name or f"agent-{int(time.time())}"
    resolved_app_id = getattr(client, "app_id", "")
    resolved_certificate = getattr(client, "app_certificate", None)

    return AgentSession(
        client=client,
        agent=self,
        app_id=resolved_app_id,
        app_certificate=resolved_certificate,
        name=resolved_name,
        channel=channel,
        token=token,
        agent_uid=agent_uid,
        remote_uids=remote_uids,
        idle_timeout=idle_timeout,
        enable_string_uid=enable_string_uid,
        preset=preset,
        pipeline_id=pipeline_id,
        expires_in=expires_in,
        debug=debug,
        warn=warn,
    )
def to_properties(
    self,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    token: typing.Optional[str] = None,
    app_id: typing.Optional[str] = None,
    app_certificate: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    skip_vendor_validation: bool = False,
) -> StartAgentsRequestProperties:
    """Build the ``StartAgentsRequestProperties`` payload for this agent.

    Two shapes are produced:

    * MLLM mode (an MLLM config is set): the payload carries ``mllm`` and no
      ``asr``/``llm``/``tts`` entries are added. The agent-level greeting and
      failure message are only used as defaults for keys the MLLM config
      does not already contain.
    * Cascading mode (no MLLM config): ``asr`` and ``turn_detection`` are
      always resolved and set. Unless ``skip_vendor_validation`` is true,
      ``llm`` and ``tts`` are required and attached, with agent-level
      instructions/greeting/etc. overriding the LLM vendor's values.

    When ``token`` is ``None`` a token is generated from ``app_id`` and
    ``app_certificate``; ``expires_in`` is validated and forwarded as
    ``token_expire`` only when provided.

    Raises:
        ValueError: if MLLM is combined with an avatar that is not explicitly
            disabled; if neither ``token`` nor ``app_id`` + ``app_certificate``
            is given; or, in cascading mode without
            ``skip_vendor_validation``, if TTS or LLM is not configured.
    """
    # Validate the MLLM + enabled-avatar combination BEFORE generating the
    # RTC token so callers get a clear, actionable error first (matches the
    # TypeScript and Go SDKs' fail-fast contract).
    mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True
    # NOTE: because of the ``or`` below, MLLM mode is active whenever an MLLM
    # config exists at all, regardless of its ``enable`` value.
    is_mllm_mode = bool(mllm_flag or self._mllm is not None)
    # An avatar counts as enabled unless its config says ``enable`` is False.
    avatar_enabled = (
        isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
    )
    if is_mllm_mode and avatar_enabled:
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    if token is None:
        if app_id is None or app_certificate is None:
            raise ValueError("Either token or app_id+app_certificate must be provided")
        validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None
        # Use generate_convo_ai_token (RTC + RTM) so the token works whether or
        # not the caller enables advanced_features.enable_rtm.
        token_kwargs: typing.Dict[str, typing.Any] = {}
        # Only pass token_expire when the caller supplied expires_in, so the
        # token helper's own default applies otherwise.
        if validated_expires_in is not None:
            token_kwargs["token_expire"] = validated_expires_in
        token = generate_convo_ai_token(
            app_id=app_id,
            app_certificate=app_certificate,
            channel_name=channel,
            uid=_parse_numeric_uid(agent_uid, "agent_uid"),
            **token_kwargs,
        )

    # Fields that are always present in the request.
    base_kwargs: typing.Dict[str, typing.Any] = {
        "channel": channel,
        "token": token,
        "agent_rtc_uid": agent_uid,
        "remote_rtc_uids": remote_uids,
    }

    # Optional fields are only included when set, so unset values are left
    # out of the request instead of being passed as None.
    if idle_timeout is not None:
        base_kwargs["idle_timeout"] = idle_timeout
    if enable_string_uid is not None:
        base_kwargs["enable_string_uid"] = enable_string_uid
    if self._mllm is not None:
        base_kwargs["mllm"] = self._mllm
    if self._turn_detection is not None:
        base_kwargs["turn_detection"] = self._turn_detection
    if self._interruption is not None:
        base_kwargs["interruption"] = self._interruption
    if self._sal is not None:
        base_kwargs["sal"] = self._sal
    if self._avatar is not None:
        base_kwargs["avatar"] = self._avatar
    if self._advanced_features is not None:
        base_kwargs["advanced_features"] = self._advanced_features
    # _resolved_parameters() may default data_channel to "rtm" when RTM is on.
    parameters = self._resolved_parameters()
    if parameters is not None:
        # Dict-style parameters are converted to the generated model here.
        if isinstance(parameters, dict):
            base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
        else:
            base_kwargs["parameters"] = parameters
    if self._geofence is not None:
        base_kwargs["geofence"] = self._geofence
    if self._labels is not None:
        base_kwargs["labels"] = self._labels
    if self._rtc is not None:
        base_kwargs["rtc"] = self._rtc
    if self._filler_words is not None:
        base_kwargs["filler_words"] = self._filler_words

    if is_mllm_mode:
        if self._mllm is not None:
            # Copy so the agent's stored MLLM config is not mutated.
            mllm_config = dict(self._mllm)
            # setdefault: values already present in the MLLM config win over
            # the agent-level convenience fields.
            if self._greeting is not None:
                mllm_config.setdefault("greeting_message", self._greeting)
            if self._failure_message is not None:
                mllm_config.setdefault("failure_message", self._failure_message)
            base_kwargs["mllm"] = mllm_config
        return StartAgentsRequestProperties(**base_kwargs)

    # Cascading pipeline: these replace any turn_detection set above with the
    # language-resolved version.
    base_kwargs["asr"] = self._resolve_asr_config()
    base_kwargs["turn_detection"] = self._resolve_turn_detection_config()

    # NOTE: this early return happens before "llm" and "tts" are attached, so
    # they are absent from the result even if they were configured.
    if skip_vendor_validation:
        return StartAgentsRequestProperties(**base_kwargs)

    if self._tts is None:
        raise ValueError("TTS configuration is required. Use with_tts() to set it.")

    if self._llm is None:
        raise ValueError("LLM configuration is required. Use with_llm() to set it.")

    # Copy so the agent's stored LLM config is not mutated.
    llm_config = dict(self._llm)
    # Agent-level fields take priority over the vendor's defaults.
    # This matches the TS SDK where agent-level values override vendor config.
    if self._instructions is not None:
        llm_config["system_messages"] = [{"role": "system", "content": self._instructions}]
    if self._greeting is not None:
        llm_config["greeting_message"] = self._greeting
    if self._greeting_configs is not None:
        llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs)
    if self._failure_message is not None:
        llm_config["failure_message"] = self._failure_message
    if self._max_history is not None:
        llm_config["max_history"] = self._max_history

    base_kwargs["llm"] = llm_config
    base_kwargs["tts"] = self._tts

    return StartAgentsRequestProperties(**base_kwargs)
self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + 
is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation: bool, + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=skip_vendor_validation, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] 
= self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. 
+ + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = 
StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). 
+ interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_pipeline_id.py: | + import pytest + + from agora_agent import Agent + + + def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class StartResponse: + agent_id = "agent-id" + + + class FakeAgentsClient: + def __init__(self): + self.calls = [] + + def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + + class FakeAsyncAgentsClient: + def __init__(self): + self.calls = [] + + async def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + + class FakeClient: + app_id = "appid" + app_certificate = None + + def __init__(self, agents): + self.agents = agents + + + def start_agent(agent, **overrides): + agents = FakeAgentsClient() + client = FakeClient(agents) + options = { + "channel": "channel", + "token": "token", + "agent_uid": "1", + "remote_uids": ["100"], + **overrides, + } + + agent_id = agent.create_session(client, **options).start() + + assert agent_id == "agent-id" + assert len(agents.calls) == 1 + return agents.calls[0] + + + def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["appid"] == "appid" + assert call["name"] == "support" + assert call["pipeline_id"] == "studio-pipeline-id" + properties = dump(call["properties"]) + assert properties["channel"] == "channel" + assert properties["token"] == "token" + assert properties["agent_rtc_uid"] == "1" + assert properties["remote_rtc_uids"] == ["100"] + + + def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + call = start_agent( + Agent(name="support", pipeline_id="agent-pipeline"), + pipeline_id="session-pipeline", + ) + + assert call["pipeline_id"] == 
"session-pipeline" + + + def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + + + def test_pipeline_id_is_not_sent_inside_properties() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(call["properties"]) + + + def test_pipeline_id_survives_builder_clone() -> None: + agent = Agent(name="support", pipeline_id="studio-pipeline-id").with_tools(True) + + assert agent.pipeline_id == "studio-pipeline-id" + call = start_agent(agent) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + + @pytest.mark.asyncio + async def test_async_session_uses_agent_pipeline_id() -> None: + agents = FakeAsyncAgentsClient() + client = FakeClient(agents) + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + agent_id = await agent.create_async_session( + client, + channel="channel", + token="token", + agent_uid="1", + remote_uids=["100"], + ).start() + + assert agent_id == "agent-id" + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) + status: unresolved + - id: patch-8e22e6d0 + content_hash: sha256:4baa4d46c129dde02b82a8367fdc1f9217d52267f82eb18f190d230d39a90927 + original_commit: 8e22e6d069e77f4c652e15f2f37945538c88c7c4 + original_message: udpated agent docs + original_author: Hermes (agora) + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 8e22e6d069e77f4c652e15f2f37945538c88c7c4 Mon Sep 17 00:00:00 2001 + From: "Hermes (agora)" + Date: Tue, 2 Jun 2026 15:36:16 -0400 + Subject: [PATCH] udpated agent docs + + --- + docs/reference/agent.md | 2 +- + 1 file changed, 1 
insertion(+), 1 deletion(-) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 86d4fbd..5693e0b 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -34,7 +34,6 @@ Agent( + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + -| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + @@ -48,6 +47,7 @@ Agent( + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | + | `failure_message` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | + | `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. 
| + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + | `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. 
+ + + ```python + from agora_agent import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. + + + ```python + from agora_agent import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. 
+ + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Deprecated. Configure `system_messages` on the LLM vendor instead. + + ### `with_greeting(greeting: str) -> Agent` + + Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. + + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. + + ### `with_max_history(max_history: int) -> Agent` + + Deprecated. Configure `max_history` on the LLM vendor instead. 
+ + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. 
| + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | + | `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | + | `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | + | `max_history` | `Optional[int]` | Deprecated Agent-level max history | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ status: unresolved + - id: patch-bed29b6b + content_hash: sha256:8008d9c33a194a48ef317868953c26d5b03ede60c23743b4249260894c0f6417 + original_commit: bed29b6b7d4d08480a8510b26b5e21d1ef234cc9 + original_message: "chore: bump Python packages to 2.1.0" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - compat/agora-agent-server-sdk/pyproject.toml + patch_content: | + diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index ac93128..468294b 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + -version = "v2.0.0" + +version = "v2.1.0" + description = "Compatibility shim for the renamed agora-agents package." + readme = "README.md" + authors = [] + @@ -35,7 +35,7 @@ Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-pyth + + [tool.poetry.dependencies] + python = "^3.8" + -agora-agents = ">=2.0.0,<3.0.0" + +agora-agents = ">=2.1.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + theirs_snapshot: + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.1.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.1.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + user_owned: true diff --git a/PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md b/PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md new file mode 100644 index 0000000..f3cd64a --- /dev/null +++ b/PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md @@ -0,0 +1,27 @@ +# Python AgentKit Snake Case API Audit + +Scope: `agora-agents-python` public AgentKit wrappers, docs, and tests. + +Search terms: + +```bash +rg -n "apiKey|baseUrl|modelId|voiceId|groupId|keyTerm|turnDetection|inputAudioTranscription|greetingMessage|failureMessage|projectId|adcCredentialsString|sampleRate|targetLanguageCode|resourceName|deploymentName" agora-agents-python +``` + +## Result + +No shipped camelCase public Python constructor kwargs were found in source or docs examples. No deprecated alias helper is required for this pass. 
+ +| File | Class / symbol | Public arg or example | Current spelling | Desired Python spelling | `to_config()` key | Wire key | Action | Compatibility needed | Test coverage | +|---|---|---|---|---|---|---|---|---|---| +| `src/agora_agent/agentkit/vendors/tts.py` | `GoogleTTS` | constructor arg | `voice_name` | `voice_name` | `params.VoiceSelectionParams` | `params.VoiceSelectionParams` | keep | no | `tests/custom/test_tts_vendors.py` | +| `src/agora_agent/agentkit/vendors/tts.py` | `RimeTTS` | constructor arg | `model_id` | `model_id` | `params.modelId` | `params.modelId` | keep | no | `tests/custom/test_tts_vendors.py` | +| `src/agora_agent/agentkit/vendors/tts.py` | `MurfTTS` | constructor arg | `voice_id` | `voice_id` | `params.voiceId` | `params.voiceId` | keep | no | `tests/custom/test_tts_vendors.py`, `tests/custom/test_request_body.py` | +| `src/agora_agent/types/rime_tts_params.py` | generated model | generated alias | `modelId` | n/a | `model_id` | `modelId` | keep | no | `tests/custom/test_tts_vendors.py` | +| `src/agora_agent/types/murf_tts_params.py` | generated model | generated alias | `voiceId` | n/a | `voice_id` | `voiceId` | keep | no | `tests/custom/test_tts_vendors.py` | +| `tests/custom/test_request_body.py` | wire assertion | payload key | `voiceId` | n/a | `params.voiceId` | `params.voiceId` | keep | no | request-body test | +| `tests/custom/test_tts_vendors.py` | wire assertion | payload key | `modelId`, `voiceId`, `VoiceSelectionParams` | n/a | generated model fields | wire aliases | keep | no | wire serialization test | + +## Guardrail Added + +`tests/custom/test_docs_snake_case.py` scans Python markdown code fences and fails on common camelCase kwargs such as `apiKey`, `baseUrl`, `modelId`, `voiceId`, `projectId`, and `greetingMessage`. JSON, TypeScript, Go, shell, and YAML examples are skipped so wire payload examples can retain required non-Python keys. 
diff --git a/README.md b/README.md index c8cbabf..4dee35d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install agora-agents ## Quick Start Start with the `Agent` builder: create a client with app credentials, choose your ASR, LLM, and TTS providers, then start a session. Omit vendor API keys for supported Agora-managed models, or provide keys when you want BYOK. -Set Agora interaction language with `turn_detection.language`; provider-specific STT language values remain under `asr.params`. +Set Agora interaction language with `turn_detection.language`; provider-specific STT language values remain under `asr.params`. Ares uses only the REST `asr.language` value sourced from `turn_detection.language`. ```python import os diff --git a/changelog.md b/changelog.md index dc8dcc6..303d9fa 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [v2.2.0] — 2026-06-05 + +### Added + +- **Expanded provider surface** — Added generated API support for the latest Conversational AI vendors and configuration types, including Dify LLM and Generic Avatar. +- **Interaction language handling** — AgentKit now consistently derives REST `asr.language` from `turn_detection.language` while keeping provider-specific STT language values under `asr.params`. +- **Deepgram keyterm** — Added `keyterm` support on `DeepgramSTT`, serialized as `asr.params.keyterm`. + +### Changed + +- **MiniMax managed presets** — MiniMax preset-backed TTS now keeps the preset model as an internal hint while sending only supported partial TTS settings such as `voice_setting.voice_id`. +- **Vertex AI LLM routing** — `VertexAILLM` now keeps project and location in the generated endpoint URL instead of duplicating them in `llm.params`. 
+ +### Fixed + +- **Provider wire keys** — Corrected alias-sensitive TTS payloads so Google TTS emits `VoiceSelectionParams` and `AudioConfig`, Rime TTS emits `modelId`, and Murf TTS preserves `voiceId`. +- **AgentKit request validation** — Start request validation now de-aliases REST-shaped provider dictionaries before constructing generated request models, while still allowing preset and pipeline-backed partial configs. +- **Request body coverage** — Added regression tests for BYOK, preset-backed, mixed preset/BYOK, and pipeline override request shapes across provider configurations. +- **Python docs examples** — Added a docs guard to keep Python examples on snake_case kwargs while allowing documented JSON wire keys. + ## [v2.1.0] — 2026-06-02 ### Added @@ -21,7 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Fixed - **Managed-provider validation** — AgentKit validation now distinguishes preset-backed providers from BYOK providers so required provider fields are only required when credentials are caller-supplied. -- **Language placement** — Provider-specific STT language values remain under `asr.params`, while Agora interaction language is emitted separately as `turn_detection.language`. +- **Language placement** — Provider-specific STT language values remain under `asr.params`; the REST `asr.language` field is populated from `turn_detection.language`. ## [v2.0.0] — 2026-05-21 @@ -114,7 +134,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Fixed -- **`AresSTT`** — Removed redundant `language` key from the `params` dict. Language is now emitted only at the top level. `params` is only included when `additional_params` is provided. +- **`AresSTT`** — Removed redundant `language` key from the `params` dict. Ares only selects the provider; AgentKit populates REST `asr.language` from `turn_detection.language`. `params` is only included when `additional_params` is provided. 
- **`OpenAIRealtime` / `VertexAI` (MLLM)** — Agent-level `greeting` and `failure_message` defaults are now correctly applied when missing in MLLM mode. Previously these values were silently dropped. - **`VertexAI` (MLLM)** — `messages` is emitted at the MLLM top level, matching the generated core SDK contract. diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml index eea45d7..078ac75 100644 --- a/compat/agora-agent-server-sdk/pyproject.toml +++ b/compat/agora-agent-server-sdk/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" [tool.poetry] name = "agora-agent-server-sdk" -version = "v2.1.1" +version = "v2.2.0" description = "Compatibility shim for the renamed agora-agents package." readme = "README.md" authors = [] @@ -35,7 +35,7 @@ Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-pyth [tool.poetry.dependencies] python = "^3.8" -agora-agents = ">=2.1.1,<3.0.0" +agora-agents = ">=2.2.0,<3.0.0" [build-system] requires = ["poetry-core"] diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index c59ae7c..217b77d 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -75,12 +75,12 @@ tts = ElevenLabsTTS( Used with `agent.with_stt()`. -Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. Ares does not take a provider language option; AgentKit uses `turn_detection.language` for REST `asr.language`. 
| Class | Provider | Required Parameters | |---|---|---| | `SpeechmaticsSTT` | Speechmatics | `api_key`, `language` | -| `DeepgramSTT` | Deepgram | `model` for Agora-managed `nova-2`/`nova-3`; `api_key` for BYOK | +| `DeepgramSTT` | Deepgram | `model` for Agora-managed `nova-2`/`nova-3`; `api_key` for BYOK; `language?`, `keyterm?` | | `MicrosoftSTT` | Microsoft Azure | `key`, `region`, `language` | | `OpenAISTT` | OpenAI | `api_key` | | `GoogleSTT` | Google Cloud | `project_id`, `location`, `adc_credentials_string`, `language` | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index cfa8580..1ab8aeb 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -318,7 +318,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid ## STT Vendors -Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. AgentKit populates REST `asr.language` from `turn_detection.language`. ### `SpeechmaticsSTT` @@ -336,6 +336,7 @@ Use `turn_detection.language` for Agora interaction language; it defaults to `en | `api_key` | `str` | BYOK only | `None` | Deepgram API key. Optional only for Agora-managed `nova-2` and `nova-3`. 
| | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `keyterm` | `str` | No | `None` | Boost specialized terms and brands; serialized as `asr.params.keyterm` | | `smart_format` | `bool` | No | `None` | Enable smart formatting | | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | @@ -396,7 +397,6 @@ For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. For | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `language` | `str` | No | `None` | Language code | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `SarvamSTT` diff --git a/pyproject.toml b/pyproject.toml index f1e9e04..327306a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agents" [tool.poetry] name = "agora-agents" -version = "v2.1.1" +version = "v2.2.0" description = "" readme = "README.md" authors = [] diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 6275f04..1daba82 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -76,6 +76,7 @@ from ..agent_management.types.agent_think_agent_management_response import ( AgentThinkAgentManagementResponse, ) +from ..core.pydantic_utilities import parse_obj_as from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS # Top-level aliases @@ -188,6 +189,13 @@ class SessionOptions(typing_extensions.TypedDict, total=False): debug: bool warn: typing.Callable[[str], None] + +def _start_properties_from_mapping( + properties: typing.Mapping[str, typing.Any], +) -> StartAgentsRequestProperties: + return parse_obj_as(StartAgentsRequestProperties, dict(properties)) + + # LLM sub-type aliases LlmGreetingConfigs = typing.Dict[str, typing.Any] LlmGreetingConfigsMode = typing.Any @@ -298,7 +306,7 @@ def 
_is_turn_detection_language(value: typing.Any) -> bool: def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: if not _is_turn_detection_language(value): - raise ValueError(f"Invalid interaction language: {value}") + raise ValueError(f"Invalid turn_detection.language: {value}") return value # type: ignore[return-value] @@ -896,7 +904,7 @@ def to_properties( if self._failure_message is not None: mllm_config.setdefault("failure_message", self._failure_message) base_kwargs["mllm"] = mllm_config - return StartAgentsRequestProperties(**base_kwargs) + return _start_properties_from_mapping(base_kwargs) if skip_vendor_validation: warnings.warn( @@ -919,12 +927,13 @@ def to_properties( allow_missing_llm = "llm" in allow_missing_categories allow_missing_tts = "tts" in allow_missing_categories + turn_detection_config = self._resolve_turn_detection_config() if not skip_asr_validation and (self._stt is not None or not allow_missing_asr): - base_kwargs["asr"] = self._resolve_asr_config() - base_kwargs["turn_detection"] = self._resolve_turn_detection_config() + base_kwargs["asr"] = self._resolve_asr_config(turn_detection_config) + base_kwargs["turn_detection"] = turn_detection_config if skip_vendor_validation: - return StartAgentsRequestProperties(**base_kwargs) + return _start_properties_from_mapping(base_kwargs) if self._tts is None and not (skip_tts_validation or allow_missing_tts): raise ValueError("TTS configuration is required. Use with_tts() to set it.") @@ -937,39 +946,34 @@ def to_properties( if self._tts is not None and not skip_tts_validation: base_kwargs["tts"] = self._tts - return StartAgentsRequestProperties(**base_kwargs) + return _start_properties_from_mapping(base_kwargs) def _resolve_llm_config(self) -> typing.Dict[str, typing.Any]: llm_config = dict(self._llm or {}) - # Agent-level fields take priority over the vendor's defaults. - # This matches the TS SDK where agent-level values override vendor config. 
- if self._instructions is not None: + if self._instructions is not None and "system_messages" not in llm_config: llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] - if self._greeting is not None: + if self._greeting is not None and "greeting_message" not in llm_config: llm_config["greeting_message"] = self._greeting - if self._greeting_configs is not None: + if self._greeting_configs is not None and "greeting_configs" not in llm_config: llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) - if self._failure_message is not None: + if self._failure_message is not None and "failure_message" not in llm_config: llm_config["failure_message"] = self._failure_message - if self._max_history is not None: + if self._max_history is not None and "max_history" not in llm_config: llm_config["max_history"] = self._max_history return llm_config - def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + def _resolve_asr_config(self, turn_detection_config: TurnDetectionConfig) -> typing.Dict[str, typing.Any]: asr_config = dict(self._stt or {}) - asr_config.pop("language", None) if not asr_config: asr_config["vendor"] = "ares" + asr_config["language"] = self._field_value(turn_detection_config, "language") return asr_config def _resolve_turn_detection_config(self) -> TurnDetectionConfig: - existing_stt_language = self._stt.get("language") if self._stt is not None else None existing_turn_detection_language = self._field_value(self._turn_detection, "language") language = ( existing_turn_detection_language if existing_turn_detection_language is not None - else existing_stt_language - if _is_turn_detection_language(existing_stt_language) else DEFAULT_TURN_DETECTION_LANGUAGE ) language = _validate_turn_detection_language(language) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index dbff562..2900c18 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ 
b/src/agora_agent/agentkit/agent_session.py @@ -15,8 +15,7 @@ AgentThinkAgentManagementResponse as AgentThinkResponse, ) from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse -from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties -from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions +from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions, _start_properties_from_mapping from .avatar_types import ( is_akool_avatar, is_anam_avatar, @@ -24,6 +23,7 @@ is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) @@ -333,15 +333,15 @@ def _build_start_properties( properties["tts"] = self._dump_model(self._agent.tts) if self._agent.llm is not None: llm = dict(self._agent.llm) - if self._agent.instructions is not None: + if self._agent.instructions is not None and "system_messages" not in llm: llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] - if self._agent.greeting is not None: + if self._agent.greeting is not None and "greeting_message" not in llm: llm["greeting_message"] = self._agent.greeting - if self._agent.greeting_configs is not None: + if self._agent.greeting_configs is not None and "greeting_configs" not in llm: llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) - if self._agent.failure_message is not None: + if self._agent.failure_message is not None and "failure_message" not in llm: llm["failure_message"] = self._agent.failure_message - if self._agent.max_history is not None: + if self._agent.max_history is not None and "max_history" not in llm: llm["max_history"] = self._agent.max_history properties["llm"] = llm if self._agent.stt is not None: @@ -349,6 +349,47 @@ def _build_start_properties( return properties + @staticmethod + def _request_properties_for_start( + resolved_properties: typing.Dict[str, typing.Any], + *, + resolved_preset: 
typing.Optional[str], + pipeline_id: typing.Optional[str], + ) -> typing.Any: + try: + return _start_properties_from_mapping(resolved_properties) + except Exception as exc: + if pipeline_id: + return resolved_properties + if resolved_preset: + normalized_preset = normalize_preset_input(resolved_preset) + if not normalized_preset: + raise + preset_categories = { + category + for item in normalized_preset.split(",") + for category in [get_preset_category(item)] + if category is not None + } + error_categories = _AgentSessionBase._validation_error_categories(exc) + if error_categories and error_categories.issubset(preset_categories): + return resolved_properties + raise + + @staticmethod + def _validation_error_categories(exc: Exception) -> typing.Set[str]: + errors = getattr(exc, "errors", None) + if not callable(errors): + return set() + categories: typing.Set[str] = set() + for error in errors(): + loc = error.get("loc") if isinstance(error, dict) else None + if isinstance(loc, tuple) and loc: + field = loc[0] + if field in {"asr", "llm", "tts"}: + categories.add(typing.cast(str, field)) + return categories + def _vendor_validation_categories( self, pipeline_id: typing.Optional[str], @@ -513,10 +554,11 @@ def start(self) -> str: "properties": resolved_properties, }) - try: - request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) - except Exception: - request_properties = resolved_properties + request_properties = self._request_properties_for_start( + resolved_properties, + resolved_preset=resolved_preset, + pipeline_id=pipeline_id, + ) response = self._client.agents.start( self._app_id, @@ -840,10 +882,11 @@ async def start(self) -> str: "properties": resolved_properties, }) - try: - request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) - except Exception: - request_properties = resolved_properties + request_properties = self._request_properties_for_start( + resolved_properties, + 
resolved_preset=resolved_preset, + pipeline_id=pipeline_id, + ) response = await self._client.agents.start( self._app_id, diff --git a/src/agora_agent/agentkit/presets.py b/src/agora_agent/agentkit/presets.py index 68d27df..f160cee 100644 --- a/src/agora_agent/agentkit/presets.py +++ b/src/agora_agent/agentkit/presets.py @@ -108,7 +108,7 @@ def infer_asr_preset(asr: typing.Optional[typing.Dict[str, typing.Any]]) -> typi if not asr or asr.get("vendor") != "deepgram": return None params = asr.get("params") or {} - if params.get("api_key"): + if params.get("key"): return None return _DEEPGRAM_MODEL_TO_PRESET.get(_normalize_model_name(params.get("model")) or "") @@ -137,7 +137,9 @@ def infer_tts_preset(tts: typing.Optional[typing.Dict[str, typing.Any]]) -> typi if vendor == "minimax": if params.get("key"): return None - return _MINIMAX_MODEL_TO_PRESET.get(_normalize_model_name(params.get("model")) or "") + # Model is no longer in params for the preset path; fall back to the top-level hint. + model = _normalize_model_name(params.get("model")) or _normalize_model_name(tts.get("_minimax_preset_model")) or "" + return _MINIMAX_MODEL_TO_PRESET.get(model) return None @@ -184,6 +186,9 @@ def strip_inferred_preset_fields(properties: typing.Dict[str, typing.Any], infer params["group_id"] = None params["url"] = None tts = {k: v for k, v in {**tts, "params": _omit_none(params)}.items() if v is not None} + tts.pop("_minimax_preset_model", None) + if tts and "_minimax_preset_model" in tts: + tts = {k: v for k, v in tts.items() if k != "_minimax_preset_model"} return {**properties, "asr": asr, "llm": llm, "tts": tts} diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 50bdd08..1bd9633 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -206,3 +206,4 @@ def to_config(self) -> Dict[str, Any]: enable = self.options.enable if self.options.enable is not None else True return 
{"enable": enable, "vendor": "anam", "params": params} + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 9156a01..5a9f39e 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -376,12 +376,13 @@ def to_config(self) -> Dict[str, Any]: options = _dump_optional_model(self.options) options.pop("project_id", None) options.pop("location", None) - config = Gemini(**options).to_config() - params = dict(config["params"]) - params["project_id"] = self.options.project_id - params["location"] = self.options.location - config["params"] = params - return config + if not options.get("url"): + options["url"] = ( + f"https://{self.options.location}-aiplatform.googleapis.com/v1/projects/" + f"{self.options.project_id}/locations/{self.options.location}/" + f"publishers/google/models/{self.options.model}:streamGenerateContent?alt=sse" + ) + return Gemini(**options).to_config() class AmazonBedrockOptions(BaseModel): diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index 236a494..6a260d8 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import BaseModel, ConfigDict, Field diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index e5117b0..d390573 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -1,89 +1,12 @@ -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Optional from pydantic import BaseModel, ConfigDict, Field, model_validator -from typing_extensions import Literal from .base import BaseSTT 
-TurnDetectionLanguage = Literal[ - "ar-EG", - "ar-JO", - "ar-SA", - "ar-AE", - "bn-IN", - "zh-CN", - "zh-HK", - "zh-TW", - "nl-NL", - "en-IN", - "en-US", - "fil-PH", - "fr-FR", - "de-DE", - "gu-IN", - "he-IL", - "hi-IN", - "id-ID", - "it-IT", - "ja-JP", - "kn-IN", - "ko-KR", - "ms-MY", - "fa-IR", - "pt-PT", - "ru-RU", - "es-ES", - "ta-IN", - "te-IN", - "th-TH", - "tr-TR", - "vi-VN", -] - -TURN_DETECTION_LANGUAGE_VALUES: Tuple[TurnDetectionLanguage, ...] = ( - "ar-EG", - "ar-JO", - "ar-SA", - "ar-AE", - "bn-IN", - "zh-CN", - "zh-HK", - "zh-TW", - "nl-NL", - "en-IN", - "en-US", - "fil-PH", - "fr-FR", - "de-DE", - "gu-IN", - "he-IL", - "hi-IN", - "id-ID", - "it-IT", - "ja-JP", - "kn-IN", - "ko-KR", - "ms-MY", - "fa-IR", - "pt-PT", - "ru-RU", - "es-ES", - "ta-IN", - "te-IN", - "th-TH", - "tr-TR", - "vi-VN", -) -_TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} -def _turn_detection_language(language: Optional[str]) -> Optional[TurnDetectionLanguage]: - if language in _TURN_DETECTION_LANGUAGES: - return language # type: ignore[return-value] - return None - - class SpeechmaticsSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -112,9 +35,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "speechmatics", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -124,6 +44,7 @@ class DeepgramSTTOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="Deepgram API key") model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + keyterm: Optional[str] = Field(default=None, description="Boost specialized terms and brands for Deepgram") smart_format: Optional[bool] = Field(default=None, description="Enable 
smart formatting") punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -151,13 +72,12 @@ def to_config(self) -> Dict[str, Any]: params["smart_format"] = self.options.smart_format if self.options.punctuation is not None: params["punctuation"] = self.options.punctuation + if self.options.keyterm is not None: + params["keyterm"] = self.options.keyterm config: Dict[str, Any] = { "vendor": "deepgram", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -186,9 +106,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "microsoft", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -210,22 +127,26 @@ def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = dict(self.options.additional_params or {}) params["api_key"] = self.options.api_key - transcription = {"model": "whisper-1", **(self.options.input_audio_transcription or {})} + transcription: Dict[str, Any] = {"model": "gpt-4o-mini-transcribe"} + transcription.update(self.options.input_audio_transcription or {}) if self.options.model is not None: transcription["model"] = self.options.model if self.options.prompt is not None: transcription["prompt"] = self.options.prompt if self.options.language is not None: transcription["language"] = self.options.language + if not transcription.get("model"): + raise ValueError("OpenAISTT: input_audio_transcription.model is required") + if not transcription.get("prompt"): + raise ValueError("OpenAISTT: input_audio_transcription.prompt is required") + if not transcription.get("language"): + raise ValueError("OpenAISTT: input_audio_transcription.language is required") 
params["input_audio_transcription"] = transcription config: Dict[str, Any] = { "vendor": "openai", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -260,9 +181,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "google", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -293,9 +211,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "amazon", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -323,16 +238,12 @@ def to_config(self) -> Dict[str, Any]: "vendor": "assemblyai", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config class AresSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") - language: Optional[TurnDetectionLanguage] = Field(default=None, description="Language code") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AresSTT(BaseSTT): @@ -341,8 +252,6 @@ def __init__(self, **kwargs: Any): def to_config(self) -> Dict[str, Any]: config: Dict[str, Any] = {"vendor": "ares"} - if self.options.language is not None: - config["language"] = self.options.language if self.options.additional_params: config["params"] = self.options.additional_params return config @@ -373,7 +282,4 @@ def to_config(self) -> Dict[str, Any]: "vendor": "sarvam", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config 
diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index a052ea5..acfec78 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -114,7 +114,7 @@ def _validate_byok_params(self) -> "OpenAITTSOptions": ("model", self.model), ("base_url", self.base_url), ) - if value is None + if not value ] if missing: raise ValueError(f"OpenAITTS requires {', '.join(missing)} when api_key is set") @@ -436,17 +436,20 @@ def sample_rate(self) -> Optional[int]: return None def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"model": self.options.model} + params: Dict[str, Any] = {} if self.options.key is not None: params["key"] = self.options.key - if self.options.group_id is not None: params["group_id"] = self.options.group_id + params["model"] = self.options.model + params["url"] = self.options.url if self.options.voice_id is not None: params["voice_setting"] = {"voice_id": self.options.voice_id} - if self.options.url is not None: - params["url"] = self.options.url result: Dict[str, Any] = {"vendor": "minimax", "params": params} + if self.options.key is None: + # Preset path: model not in params; stored as top-level hint for preset + # inference. Stripped by strip_inferred_preset_fields before the POST body. 
+ result["_minimax_preset_model"] = self.options.model if self.options.skip_patterns is not None: result["skip_patterns"] = self.options.skip_patterns return result diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index a8efe07..ba5e462 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.1.1", + "User-Agent": "agora-agents/v2.2.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.1.1", + "X-Fern-SDK-Version": "v2.2.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py index f08086f..1f2225d 100644 --- a/src/agora_agent/types/asr.py +++ b/src/agora_agent/types/asr.py @@ -54,7 +54,6 @@ class Asr_Deepgram(UncheckedBaseModel): vendor: typing.Literal["deepgram"] = "deepgram" language: typing.Optional[AsrLanguage] = None params: DeepgramAsrParams - keyterm: typing.Optional[str] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py index 1c79c7b..723cd86 100644 --- a/src/agora_agent/types/deepgram_asr.py +++ b/src/agora_agent/types/deepgram_asr.py @@ -16,10 +16,6 @@ class DeepgramAsr(UncheckedBaseModel): language: typing.Optional[AsrLanguage] = None params: DeepgramAsrParams - keyterm: typing.Optional[str] = pydantic.Field(default=None) - """ - Boost specialized terms and brands for preset-backed Deepgram usage. 
- """ if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py index 259958e..6688333 100644 --- a/src/agora_agent/types/deepgram_asr_params.py +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -34,7 +34,7 @@ class DeepgramAsrParams(UncheckedBaseModel): keyterm: typing.Optional[str] = pydantic.Field(default=None) """ - Boost specialized terms and brands + Boost specialized terms and brands for Deepgram. """ if IS_PYDANTIC_V2: diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..e126cfa --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), {"data_channel": "rtm"}) # noqa: 
SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def test_with_audio_scenario_sets_session_parameter(): + agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_vendor_config_takes_priority_over_agent_level_convenience_fields(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + model="gpt-4o-mini", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", base_url="wss://api.elevenlabs.io/v1")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "vendor greeting" + assert properties.llm.failure_message == "vendor failure" + assert 
properties.llm.max_history == 1 + + +def test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", base_url="wss://api.elevenlabs.io/v1", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", base_url="wss://api.elevenlabs.io/v1", sample_rate=16000) + ) + ) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with 
pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard must fire before the token-generation step so callers get a + clear, actionable error even when app_id/app_certificate are empty. + """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is 
False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..ebeeb63 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,389 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + 
self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + warn=warn, + ) + + +def test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories={"tts", "llm", "asr"}, + ) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories={"tts", "llm", "asr"}, + ) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + 
session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories={"tts", "llm", "asr"}, + ) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_vendor_config_takes_priority_over_agent_level_convenience_fields(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + model="gpt-4o-mini", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", base_url="wss://api.elevenlabs.io/v1")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories=set(), + ) + + assert properties["llm"]["greeting_message"] == "vendor greeting" + assert properties["llm"]["failure_message"] == "vendor failure" + assert properties["llm"]["max_history"] == 1 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories=set(), + ) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + 
OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories=set(), + ) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="eleven_flash_v2_5", voice_id="voice", 
base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories={"tts", "llm", "asr"}, + ) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 + + +def test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + 
agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..e84b11e --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,116 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert 
config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def test_generic_avatar_omits_session_enriched_fields_when_unset(): + config = GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + # routing fields are top-level, not inside params + assert config["project_id"] == "explicit-project" + assert config["location"] == "explicit-region" + assert config["adc_credentials_string"] == "{}" + # model and extra_key live inside params + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from 
agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + model="gpt-4o", + base_url="https://api.openai.com/v1/chat/completions", + greeting_configs={"mode": "single_first", "interruptable": False}, + ).to_config() + + assert config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False diff --git a/tests/custom/test_docs_snake_case.py b/tests/custom/test_docs_snake_case.py new file mode 100644 index 0000000..ee08043 --- /dev/null +++ b/tests/custom/test_docs_snake_case.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import re +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] + +SCANNED_MARKDOWN = [ + ROOT / "README.md", + *sorted((ROOT / "docs").rglob("*.md")), +] + +SKIP_LANGS = { + "bash", + "console", + "go", + "javascript", + "js", + "json", + "shell", + "sh", + "text", + "ts", + "typescript", + "yaml", + "yml", +} + +PYTHON_HINTS = ( + "from agora_agent", + "import agora_agent", + "Agent(", + "OpenAI(", + "OpenAITTS(", + "OpenAISTT(", + "MiniMaxTTS(", + "DeepgramSTT(", + "GoogleTTS(", + "RimeTTS(", + "VertexAI(", + "VertexAILLM(", +) + +BLOCKED_TERMS = { + "apiKey": "api_key", + "baseUrl": "base_url", + "modelId": "model_id", + "voiceId": "voice_id", + "groupId": "group_id", + "projectId": "project_id", + "resourceName": "resource_name", + "deploymentName": "deployment_name", + "inputAudioTranscription": "input_audio_transcription", + "greetingMessage": "greeting_message", + "failureMessage": "failure_message", + "turnDetection": "turn_detection", + "adcCredentialsString": "adc_credentials_string", + 
"sampleRate": "sample_rate", + "targetLanguageCode": "target_language_code", +} + +FENCE_RE = re.compile(r"^```(?P<lang>[^\n`]*)\n(?P<body>.*?)(?:^```)", re.MULTILINE | re.DOTALL) + + +def _should_scan(lang: str, body: str) -> bool: + lang_parts = lang.strip().split(maxsplit=1) + normalized = lang_parts[0].lower() if lang_parts else "" + if normalized in {"python", "py"}: + return True + if normalized in SKIP_LANGS: + return False + if normalized: + return False + return any(hint in body for hint in PYTHON_HINTS) + + +def test_python_docs_examples_use_snake_case_kwargs() -> None: + failures: list[str] = [] + + for path in SCANNED_MARKDOWN: + text = path.read_text() + for match in FENCE_RE.finditer(text): + body = match.group("body") + if not _should_scan(match.group("lang"), body): + continue + + line_offset = text[: match.start("body")].count("\n") + for term, replacement in BLOCKED_TERMS.items(): + for term_match in re.finditer(rf"\b{re.escape(term)}\b", body): + line = line_offset + body[: term_match.start()].count("\n") + 1 + failures.append(f"{path.relative_to(ROOT)}:{line}: use {replacement} instead of {term}") + + assert not failures, "CamelCase kwargs found in Python docs examples:\n" + "\n".join(failures) diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py index 2861e45..0d07cf4 100644 --- a/tests/custom/test_llm_vendors.py +++ b/tests/custom/test_llm_vendors.py @@ -63,8 +63,10 @@ def test_vertex_ai_llm_includes_project_routing() -> None: assert config["api_key"] == "vertex-token" assert config["style"] == "gemini" assert config["params"]["model"] == "gemini-2.0-flash" - assert config["params"]["project_id"] == "project" - assert config["params"]["location"] == "us-central1" + assert "project" in config["url"] + assert "us-central1" in config["url"] + assert "project_id" not in config.get("params", {}) + assert "location" not in config.get("params", {}) def test_amazon_bedrock_serializes_as_bedrock_style() -> None: diff --git
a/tests/custom/test_request_body.py b/tests/custom/test_request_body.py new file mode 100644 index 0000000..52c8875 --- /dev/null +++ b/tests/custom/test_request_body.py @@ -0,0 +1,1207 @@ +""" +test_request_body.py — Integration-level tests for request body shape. + +Covers: + Scenario 1 — BYOK pipeline (full properties shape) + Scenario 2 — Preset-backed pipeline (managed vendors, field-stripping) + Scenario 3 — LLM config fields win over agent-level convenience fields + Scenario 4 — VertexAILLM URL construction + Scenario 5 — OpenAISTT params (5a model, 5b prompt, 5c language, 5d defaults) + Scenario 6 — Mixed preset + BYOK (6a ASR preset + BYOK LLM/TTS, 6b TTS preset + BYOK LLM/ASR) + Scenario 7 — Pipeline ID (7b shape with BYOK LLM, 7c empty properties) + Scenario 8 — MLLM mode (8a start call, 8b/8c agent-level greeting wins/vendor wins) + BYOK vendor coverage matrix (all STT, LLM, TTS vendors) + Preset coverage matrix (all inferred presets) +""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from agora_agent import ( + Agent, + AmazonBedrock, + AmazonSTT, + AmazonTTS, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + CartesiaTTS, + CustomLLM, + DeepgramSTT, + DeepgramTTS, + Dify, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GoogleSTT, + GoogleTTS, + Groq, + HumeAITTS, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISTT, + OpenAITTS, + RimeTTS, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + VertexAILLM, + XaiGrok, +) +from agora_agent.agentkit import AgentSession +from agora_agent.agentkit.presets import resolve_session_presets + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +def dump(value): + if hasattr(value, "model_dump"): + return 
value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + +def dump_wire(value): + if hasattr(value, "dict"): + return value.dict(by_alias=True) + return dump(value) + + +# --------------------------------------------------------------------------- +# Pattern 1: FakeAgentsClient — captures the full start() call +# --------------------------------------------------------------------------- + + +class StartResponse: + agent_id = "agent-id" + + +class FakeAgentsClient: + def __init__(self): + self.calls = [] + + def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + +class FakeAsyncAgentsClient: + def __init__(self): + self.calls = [] + + async def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + +class FakeClient: + app_id = "appid" + app_certificate = None + + def __init__(self, agents): + self.agents = agents + + +def start_session(agent, **session_kwargs): + """Start agent session via FakeAgentsClient and return the captured call dict.""" + agents = FakeAgentsClient() + client = FakeClient(agents) + agent.create_session( + client=client, + channel="channel", + token="test-token", + agent_uid="1", + remote_uids=["100"], + **session_kwargs, + ).start() + return agents.calls[0] + + +async def start_async_session(agent, **session_kwargs): + """Start async agent session via FakeAsyncAgentsClient and return the captured call dict.""" + agents = FakeAsyncAgentsClient() + client = FakeClient(agents) + await agent.create_async_session( + client=client, + channel="channel", + token="test-token", + agent_uid="1", + remote_uids=["100"], + **session_kwargs, + ).start() + return agents.calls[0] + + +def full_agent_with_tts(tts): + return ( + Agent(name="support") + .with_stt(DeepgramSTT(api_key="dg-key", model="nova-2", language="en")) + .with_llm( + OpenAI( + api_key="openai-key", + 
base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts(tts) + ) + + +def invalid_google_tts_properties(): + return { + "channel": "channel", + "token": "test-token", + "agent_rtc_uid": "1", + "remote_rtc_uids": ["100"], + "tts": { + "vendor": "google", + "params": { + "credentials": "{}", + }, + }, + } + + +# --------------------------------------------------------------------------- +# Pattern 2: _build_start_properties — properties-only shape +# --------------------------------------------------------------------------- + + +class _Agents: + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + return SimpleNamespace(agent_id="agent-1") + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def build_properties(agent, allow_missing=None): + session = AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="channel", + agent_uid="1", + remote_uids=["100"], + ) + return session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories=allow_missing or set(), + ) + + +def test_request_properties_validation_raises_without_preset_or_pipeline() -> None: + with pytest.raises(Exception): + AgentSession._request_properties_for_start( # noqa: SLF001 + invalid_google_tts_properties(), + resolved_preset=None, + pipeline_id=None, + ) + + +def test_request_properties_validation_fallback_allows_preset_partial_config() -> None: + properties = invalid_google_tts_properties() + + request_properties = AgentSession._request_properties_for_start( # noqa: SLF001 + properties, + resolved_preset="openai_tts_1", + pipeline_id=None, + ) + + assert request_properties is properties + + +def 
test_request_properties_validation_fallback_is_limited_to_preset_category() -> None: + with pytest.raises(Exception): + AgentSession._request_properties_for_start( # noqa: SLF001 + invalid_google_tts_properties(), + resolved_preset="openai_gpt_4o_mini", + pipeline_id=None, + ) + + +def test_request_properties_validation_fallback_allows_pipeline_partial_config() -> None: + properties = invalid_google_tts_properties() + + request_properties = AgentSession._request_properties_for_start( # noqa: SLF001 + properties, + resolved_preset=None, + pipeline_id="pipeline-id", + ) + + assert request_properties is properties + + +# =========================================================================== +# Scenario 1 — BYOK pipeline (full properties shape) +# =========================================================================== + + +def test_byok_pipeline_full_properties_shape() -> None: + """OpenAI BYOK LLM + Deepgram BYOK STT + ElevenLabs TTS produces expected properties.""" + agent = ( + Agent(name="support") + .with_stt(DeepgramSTT(api_key="dg-key", model="nova-2", language="en")) + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="voice123", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + ) + + props = build_properties(agent) + + # RTC routing + assert props["channel"] == "channel" + assert props["agent_rtc_uid"] == "1" + assert props["remote_rtc_uids"] == ["100"] + + # ASR + asr = props["asr"] + assert asr["vendor"] == "deepgram" + assert asr["params"]["key"] == "dg-key" + assert asr["params"]["model"] == "nova-2" + assert asr["params"]["language"] == "en" + + # LLM + llm = props["llm"] + assert llm["api_key"] == "openai-key" + assert llm["style"] == "openai" + assert llm["params"]["model"] == "gpt-4o" + + # TTS + tts = props["tts"] + assert tts["vendor"] == "elevenlabs" + assert 
tts["params"]["key"] == "el-key" + assert tts["params"]["model_id"] == "eleven_flash_v2_5" + assert tts["params"]["voice_id"] == "voice123" + + +# =========================================================================== +# Scenario 2 — Preset-backed pipeline (full start request, field stripping) +# =========================================================================== + + +def test_managed_llm_and_tts_produce_preset_and_strip_fields() -> None: + """Managed OpenAI LLM + MiniMax TTS generate preset string and strip BYOK fields.""" + agent = ( + Agent(name="support") + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_8_turbo", voice_id="English_captivating_female1")) + ) + + call = start_session(agent) + assert "openai_gpt_4o_mini" in (call["preset"] or "") + assert "minimax_speech_2_8_turbo" in (call["preset"] or "") + + properties = dump(call["properties"]) + # api_key and url stripped for managed LLM + assert "api_key" not in properties.get("llm", {}) + # vendor retained for TTS + assert properties["tts"]["vendor"] == "minimax" + # BYOK key stripped for managed TTS + assert "key" not in properties["tts"].get("params", {}) + + +# =========================================================================== +# Scenario 3 — LLM config wins over agent-level fields +# =========================================================================== + + +def test_llm_config_greeting_wins_over_agent_level_greeting() -> None: + """When OpenAI vendor sets greeting_message it overrides agent.with_greeting().""" + agent = ( + Agent(name="support") + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + greeting_message="vendor greeting", + ) + ) + .with_greeting("agent greeting") + ) + + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["greeting_message"] == "vendor greeting" + + +# 
=========================================================================== +# Scenario 4 — VertexAILLM URL construction +# =========================================================================== + + +def test_vertex_ai_llm_constructs_correct_url_and_params() -> None: + """VertexAILLM auto-constructs the aiplatform URL; project_id/location are URL-encoded, not in params.""" + agent = Agent(name="support").with_llm( + VertexAILLM( + api_key="vertex-token", + model="gemini-2.0-flash", + project_id="my-project", + location="us-central1", + ) + ) + + props = build_properties(agent, allow_missing={"asr", "tts"}) + llm = props["llm"] + + expected_url_fragment = "us-central1-aiplatform.googleapis.com" + assert expected_url_fragment in llm["url"] + assert "my-project" in llm["url"] + assert llm["style"] == "gemini" + assert llm["params"]["model"] == "gemini-2.0-flash" + assert "project_id" not in llm["params"] + assert "location" not in llm["params"] + + +# =========================================================================== +# Scenario 5 — OpenAISTT params +# =========================================================================== + + +def test_openai_stt_5a_model_param_is_sent() -> None: + """5a: OpenAISTT model appears inside input_audio_transcription.model.""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="transcribe clearly", + language="en", + ) + ) + + props = build_properties(agent, allow_missing={"llm", "tts"}) + transcription = props["asr"]["params"]["input_audio_transcription"] + assert transcription["model"] == "gpt-4o-mini-transcribe" + + +def test_openai_stt_5b_prompt_param_is_sent() -> None: + """5b: OpenAISTT prompt appears inside input_audio_transcription.prompt.""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="use proper nouns", + language="en", + ) + ) + + props = build_properties(agent, 
allow_missing={"llm", "tts"}) + transcription = props["asr"]["params"]["input_audio_transcription"] + assert transcription["prompt"] == "use proper nouns" + + +def test_openai_stt_5c_language_param_is_sent() -> None: + """5c: OpenAISTT language appears inside input_audio_transcription.language.""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="some prompt", + language="fr", + ) + ) + + props = build_properties(agent, allow_missing={"llm", "tts"}) + transcription = props["asr"]["params"]["input_audio_transcription"] + assert transcription["language"] == "fr" + + +def test_openai_stt_5d_api_key_is_top_level_in_params() -> None: + """5d: OpenAISTT api_key is a top-level key inside asr.params (not inside input_audio_transcription).""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="some prompt", + language="en", + ) + ) + + props = build_properties(agent, allow_missing={"llm", "tts"}) + asr_params = props["asr"]["params"] + assert asr_params["api_key"] == "oai-key" + assert "api_key" not in asr_params.get("input_audio_transcription", {}) + + +# =========================================================================== +# Scenario 6 — Mixed preset + BYOK +# =========================================================================== + + +def test_6a_asr_preset_with_byok_llm_and_tts() -> None: + """6a: Managed Deepgram ASR preset + BYOK LLM + BYOK TTS.""" + agent = ( + Agent(name="support") + .with_stt(DeepgramSTT(model="nova-3", language="en-US")) + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="voice123", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + ) + + call = start_session(agent) + preset = call.get("preset") or "" + assert "deepgram_nova_3" in 
preset + # No LLM or TTS preset inferred + assert "openai_gpt" not in preset + assert "openai_tts" not in preset + + properties = dump(call["properties"]) + assert properties["llm"]["api_key"] == "openai-key" + assert properties["tts"]["vendor"] == "elevenlabs" + + +def test_6b_tts_preset_with_byok_llm_and_asr() -> None: + """6b: Managed OpenAITTS preset + BYOK LLM + BYOK Deepgram ASR.""" + agent = ( + Agent(name="support") + .with_stt(DeepgramSTT(api_key="dg-key", model="nova-2", language="en-US")) + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts(OpenAITTS(voice="alloy")) + ) + + call = start_session(agent) + preset = call.get("preset") or "" + assert "openai_tts_1" in preset + assert "deepgram_nova_2" not in preset # BYOK key present — no ASR preset inferred + + properties = dump(call["properties"]) + # BYOK ASR: key and model both retained (nothing stripped for BYOK path) + assert properties["asr"]["params"]["key"] == "dg-key" + assert properties["asr"]["params"]["model"] == "nova-2" + # BYOK LLM key retained + assert properties["llm"]["api_key"] == "openai-key" + # TTS api_key stripped (managed) + assert "api_key" not in properties["tts"].get("params", {}) + + +# =========================================================================== +# Scenario 7 — Pipeline ID +# =========================================================================== + + +def test_7b_pipeline_id_with_byok_llm_override() -> None: + """7b: pipeline_id present, single LLM override, ASR/TTS absent from properties.""" + agent = Agent(name="support", pipeline_id="studio-pipeline").with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert properties["llm"]["api_key"] == "openai-key" + assert "asr" not in 
properties + assert "tts" not in properties + + +def test_7c_pipeline_id_empty_properties_no_vendors() -> None: + """7c: pipeline_id alone — no vendor keys in properties.""" + agent = Agent(name="support", pipeline_id="studio-pipeline") + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert "asr" not in properties + assert "llm" not in properties + assert "tts" not in properties + + +def test_7d_pipeline_id_with_byok_tts_only() -> None: + """7d: pipeline_id present, TTS-only BYOK override — ASR and LLM absent from properties.""" + agent = Agent(name="support", pipeline_id="studio-pipeline").with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="some-voice", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert "asr" not in properties + assert "llm" not in properties + assert properties["tts"]["vendor"] == "elevenlabs" + assert properties["tts"]["params"]["key"] == "el-key" + + +def test_7e_pipeline_id_with_byok_asr_and_tts() -> None: + """7e: pipeline_id present, ASR+TTS BYOK overrides — LLM absent from properties.""" + agent = ( + Agent(name="support", pipeline_id="studio-pipeline") + .with_stt(DeepgramSTT(api_key="dg-key", language="en")) + .with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="some-voice", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + ) + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert "llm" not in properties + assert properties["asr"]["vendor"] == "deepgram" + assert properties["tts"]["vendor"] == "elevenlabs" + + +# =========================================================================== +# Scenario 8 — MLLM mode +# =========================================================================== + + 
+def test_8a_mllm_start_call_has_correct_top_level_vendor() -> None: + """8a: OpenAIRealtime MLLM session – start call contains mllm with vendor=openai.""" + agent = Agent(name="support").with_mllm( + OpenAIRealtime(api_key="realtime-key", model="gpt-4o-realtime-preview", voice="coral") + ) + + call = start_session(agent) + properties = dump(call["properties"]) + assert "mllm" in properties + mllm = properties["mllm"] + assert mllm["vendor"] == "openai" + assert mllm["api_key"] == "realtime-key" + assert mllm["params"]["model"] == "gpt-4o-realtime-preview" + assert mllm["params"]["voice"] == "coral" + + +def test_8b_agent_greeting_fills_mllm_when_vendor_omits_it() -> None: + """8b: agent.with_greeting() fills mllm.greeting_message when vendor does not set it.""" + agent = ( + Agent(name="support") + .with_mllm(OpenAIRealtime(api_key="realtime-key")) + .with_greeting("hello from agent") + ) + + props = build_properties(agent) + assert props["mllm"]["greeting_message"] == "hello from agent" + + +def test_8c_vendor_greeting_wins_over_agent_level_greeting_in_mllm() -> None: + """8c: Vendor-level greeting_message wins over agent.with_greeting() in MLLM mode.""" + agent = ( + Agent(name="support") + .with_mllm( + OpenAIRealtime( + api_key="realtime-key", + greeting_message="vendor greeting", + ) + ) + .with_greeting("agent greeting") + ) + + props = build_properties(agent) + assert props["mllm"]["greeting_message"] == "vendor greeting" + + +# =========================================================================== +# BYOK Vendor Coverage Matrix — STT vendors +# =========================================================================== + + +def test_byok_deepgram_stt_params() -> None: + agent = Agent(name="t").with_stt( + DeepgramSTT(api_key="dg-key", model="nova-2", language="en") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "deepgram" + assert props["asr"]["params"]["key"] == "dg-key" + assert 
props["asr"]["params"]["model"] == "nova-2" + assert props["asr"]["params"]["language"] == "en" + + +def test_byok_microsoft_stt_params() -> None: + agent = Agent(name="t").with_stt( + MicrosoftSTT(key="ms-key", region="eastus", language="en-US") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "microsoft" + assert props["asr"]["params"]["key"] == "ms-key" + assert props["asr"]["params"]["region"] == "eastus" + assert props["asr"]["params"]["language"] == "en-US" + + +def test_byok_google_stt_params() -> None: + agent = Agent(name="t").with_stt( + GoogleSTT( + project_id="my-project", + location="global", + adc_credentials_string="{}", + language="en-US", + model="long", + ) + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "google" + p = props["asr"]["params"] + assert p["project_id"] == "my-project" + assert p["location"] == "global" + assert p["language"] == "en-US" + assert p["model"] == "long" + + +def test_byok_amazon_stt_params() -> None: + agent = Agent(name="t").with_stt( + AmazonSTT(access_key="ak", secret_key="sk", region="us-east-1", language="en-US") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "amazon" + p = props["asr"]["params"] + assert p["access_key_id"] == "ak" + assert p["secret_access_key"] == "sk" + assert p["region"] == "us-east-1" + assert p["language_code"] == "en-US" + + +def test_byok_assemblyai_stt_params() -> None: + agent = Agent(name="t").with_stt( + AssemblyAISTT(api_key="assembly-key", language="en-US") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "assemblyai" + assert props["asr"]["params"]["api_key"] == "assembly-key" + assert props["asr"]["params"]["language"] == "en-US" + + +def test_byok_ares_stt_no_params() -> None: + agent = Agent(name="t").with_stt(AresSTT()) + props = build_properties(agent, 
allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "ares" + assert "params" not in props["asr"] + + +def test_byok_speechmatics_stt_params() -> None: + agent = Agent(name="t").with_stt( + SpeechmaticsSTT(api_key="sm-key", language="en") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "speechmatics" + assert props["asr"]["params"]["api_key"] == "sm-key" + assert props["asr"]["params"]["language"] == "en" + + +def test_byok_sarvam_stt_params() -> None: + agent = Agent(name="t").with_stt( + SarvamSTT(api_key="sarvam-key", language="en-IN") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "sarvam" + assert props["asr"]["params"]["api_key"] == "sarvam-key" + assert props["asr"]["params"]["language"] == "en-IN" + + +# --------------------------------------------------------------------------- +# BYOK Vendor Coverage Matrix — LLM vendors +# --------------------------------------------------------------------------- + + +def test_byok_openai_llm_params() -> None: + agent = Agent(name="t").with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "openai-key" + assert props["llm"]["style"] == "openai" + assert props["llm"]["params"]["model"] == "gpt-4o" + + +def test_byok_azure_openai_llm_params() -> None: + agent = Agent(name="t").with_llm( + AzureOpenAI( + api_key="azure-key", + endpoint="https://example.openai.azure.com", + deployment_name="my-deployment", + model="gpt-4o", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "azure-key" + assert props["llm"]["style"] == "openai" + assert props["llm"]["params"]["model"] == "gpt-4o" + + +def test_byok_anthropic_llm_params() -> None: + agent = Agent(name="t").with_llm( + Anthropic( 
+ api_key="anthropic-key", + model="claude-3-5-sonnet-20241022", + url="https://api.anthropic.com/v1/messages", + headers={"anthropic-version": "2023-06-01"}, + max_tokens=1024, + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "anthropic-key" + assert props["llm"]["style"] == "anthropic" + assert props["llm"]["headers"]["anthropic-version"] == "2023-06-01" + assert props["llm"]["params"]["max_tokens"] == 1024 + + +def test_byok_gemini_llm_params() -> None: + agent = Agent(name="t").with_llm( + Gemini(api_key="gemini-key", model="gemini-2.0-flash") + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "gemini-key" + assert props["llm"]["style"] == "gemini" + assert props["llm"]["params"]["model"] == "gemini-2.0-flash" + + +def test_byok_groq_llm_params() -> None: + agent = Agent(name="t").with_llm( + Groq( + api_key="groq-key", + model="llama-3.3-70b-versatile", + base_url="https://api.groq.com/openai/v1/chat/completions", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "groq-key" + assert props["llm"]["style"] == "openai" + assert props["llm"]["params"]["model"] == "llama-3.3-70b-versatile" + + +def test_byok_custom_llm_params() -> None: + agent = Agent(name="t").with_llm( + CustomLLM( + api_key="custom-key", + model="my-model", + base_url="https://llm.example.com/chat", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "custom-key" + assert props["llm"]["vendor"] == "custom" + assert props["llm"]["style"] == "openai" + + +def test_byok_amazon_bedrock_llm_params() -> None: + agent = Agent(name="t").with_llm( + AmazonBedrock( + access_key="aws-access", + secret_key="aws-secret", + region="us-east-1", + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert 
props["llm"]["style"] == "bedrock" + assert props["llm"]["access_key"] == "aws-access" + assert "us-east-1" in props["llm"]["url"] + + +def test_byok_dify_llm_params() -> None: + agent = Agent(name="t").with_llm( + Dify( + api_key="dify-key", + url="https://api.dify.ai/v1/chat-messages", + model="default", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "dify-key" + assert props["llm"]["style"] == "dify" + assert props["llm"]["params"]["model"] == "default" + + +# --------------------------------------------------------------------------- +# BYOK Vendor Coverage Matrix — TTS vendors +# --------------------------------------------------------------------------- + + +def test_byok_elevenlabs_tts_params() -> None: + agent = Agent(name="t").with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="voice", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "elevenlabs" + assert props["tts"]["params"]["key"] == "el-key" + assert props["tts"]["params"]["model_id"] == "eleven_flash_v2_5" + assert props["tts"]["params"]["voice_id"] == "voice" + + +def test_byok_microsoft_tts_params() -> None: + agent = Agent(name="t").with_tts( + MicrosoftTTS(key="ms-key", region="eastus", voice_name="en-US-JennyNeural") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "microsoft" + assert props["tts"]["params"]["key"] == "ms-key" + assert props["tts"]["params"]["region"] == "eastus" + assert props["tts"]["params"]["voice_name"] == "en-US-JennyNeural" + + +def test_byok_openai_tts_params() -> None: + agent = Agent(name="t").with_tts( + OpenAITTS( + api_key="oai-tts-key", + voice="alloy", + model="tts-1-hd", + base_url="https://api.openai.com/v1", + ) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == 
"openai" + assert props["tts"]["params"]["api_key"] == "oai-tts-key" + assert props["tts"]["params"]["model"] == "tts-1-hd" + assert props["tts"]["params"]["voice"] == "alloy" + + +def test_byok_cartesia_tts_params() -> None: + agent = Agent(name="t").with_tts( + CartesiaTTS(api_key="cartesia-key", voice_id="voice", model_id="sonic-2", sample_rate=24000) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "cartesia" + p = props["tts"]["params"] + assert p["api_key"] == "cartesia-key" + assert p["voice"] == {"mode": "id", "id": "voice"} + + +def test_byok_google_tts_params() -> None: + config = GoogleTTS(key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000).to_config() + assert config["vendor"] == "google" + p = config["params"] + assert p["credentials"] == "{}" + assert p["VoiceSelectionParams"]["name"] == "en-US-JennyNeural" + assert p["VoiceSelectionParams"]["language_code"] == "en-US" + + +def test_byok_amazon_tts_params() -> None: + agent = Agent(name="t").with_tts( + AmazonTTS(access_key="access", secret_key="secret", region="us-east-1", voice_id="Joanna", engine="neural") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "amazon" + p = props["tts"]["params"] + assert p["aws_access_key_id"] == "access" + assert p["aws_secret_access_key"] == "secret" + assert p["voice"] == "Joanna" + + +def test_byok_deepgram_tts_params() -> None: + agent = Agent(name="t").with_tts( + DeepgramTTS(api_key="dg-tts-key", model="aura-2-thalia-en", base_url="wss://api.deepgram.com/v1/speak", sample_rate=24000) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "deepgram" + assert props["tts"]["params"]["api_key"] == "dg-tts-key" + assert props["tts"]["params"]["model"] == "aura-2-thalia-en" + + +def test_byok_humeai_tts_params() -> None: + agent = Agent(name="t").with_tts( + 
HumeAITTS(key="hume-key", voice_id="voice", provider="CUSTOM_VOICE") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "humeai" + assert props["tts"]["params"]["key"] == "hume-key" + assert props["tts"]["params"]["voice_id"] == "voice" + + +def test_byok_rime_tts_params() -> None: + config = RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config() + assert config["vendor"] == "rime" + assert config["params"]["api_key"] == "rime-key" + assert config["params"]["speaker"] == "speaker" + assert config["params"]["modelId"] == "mist" + + +def test_byok_fishaudio_tts_params() -> None: + agent = Agent(name="t").with_tts( + FishAudioTTS(key="fish-key", reference_id="ref", backend="speech-1.5") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "fishaudio" + assert props["tts"]["params"]["api_key"] == "fish-key" + assert props["tts"]["params"]["reference_id"] == "ref" + + +def test_byok_minimax_byok_tts_params() -> None: + agent = Agent(name="t").with_tts( + MiniMaxTTS( + key="mm-key", + group_id="group", + model="speech-02-turbo", + voice_id="voice", + url="wss://api-uw.minimax.io/ws/v1/t2a_v2", + ) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "minimax" + assert props["tts"]["params"]["key"] == "mm-key" + + +def test_byok_sarvam_tts_params() -> None: + agent = Agent(name="t").with_tts( + SarvamTTS(key="sarvam-key", speaker="anushka", target_language_code="en-IN", sample_rate=24000) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "sarvam" + assert props["tts"]["params"]["api_subscription_key"] == "sarvam-key" + assert props["tts"]["params"]["speaker"] == "anushka" + + +def test_byok_murf_tts_params() -> None: + agent = Agent(name="t").with_tts(MurfTTS(key="murf-key", voice_id="Ariana")) + props = build_properties(agent, allow_missing={"asr", 
"llm"}) + assert props["tts"]["vendor"] == "murf" + assert props["tts"]["params"]["api_key"] == "murf-key" + assert props["tts"]["params"]["voiceId"] == "Ariana" + + +def test_start_session_google_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts( + GoogleTTS( + key="{}", + voice_name="en-US-JennyNeural", + language_code="en-US", + sample_rate_hertz=24000, + ) + ) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["VoiceSelectionParams"]["name"] == "en-US-JennyNeural" + assert params["VoiceSelectionParams"]["language_code"] == "en-US" + assert params["AudioConfig"]["sample_rate_hertz"] == 24000 + assert "voice_selection_params" not in params + assert "audio_config" not in params + + +def test_start_session_rime_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts(RimeTTS(key="rime-key", speaker="speaker", model_id="mist")) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["modelId"] == "mist" + assert "model_id" not in params + + +def test_start_session_murf_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts(MurfTTS(key="murf-key", voice_id="Ariana")) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["voiceId"] == "Ariana" + assert "voice_id" not in params + + +@pytest.mark.asyncio +async def test_async_start_session_google_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts( + GoogleTTS( + key="{}", + voice_name="en-US-JennyNeural", + language_code="en-US", + sample_rate_hertz=24000, + ) + ) + + call = await start_async_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["VoiceSelectionParams"]["name"] == "en-US-JennyNeural" + assert params["VoiceSelectionParams"]["language_code"] == 
"en-US" + assert params["AudioConfig"]["sample_rate_hertz"] == 24000 + assert "voice_selection_params" not in params + assert "audio_config" not in params + + +def test_start_session_managed_minimax_tts_keeps_partial_preset_config() -> None: + agent = ( + Agent(name="support") + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_8_turbo", voice_id="English_captivating_female1")) + ) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + + assert "minimax_speech_2_8_turbo" in (call["preset"] or "") + assert properties["tts"]["vendor"] == "minimax" + assert properties["tts"]["params"] == { + "voice_setting": {"voice_id": "English_captivating_female1"}, + } + + +# --------------------------------------------------------------------------- +# BYOK Vendor Coverage Matrix — MLLM vendors +# --------------------------------------------------------------------------- + + +def test_byok_openai_realtime_mllm_params() -> None: + agent = Agent(name="t").with_mllm( + OpenAIRealtime(api_key="realtime-key", model="gpt-4o-realtime-preview", voice="coral") + ) + props = build_properties(agent) + assert props["mllm"]["vendor"] == "openai" + assert props["mllm"]["api_key"] == "realtime-key" + assert props["mllm"]["params"]["model"] == "gpt-4o-realtime-preview" + assert props["mllm"]["params"]["voice"] == "coral" + + +def test_byok_gemini_live_mllm_params() -> None: + agent = Agent(name="t").with_mllm( + GeminiLive(api_key="gemini-key", model="gemini-live-2.5-flash") + ) + props = build_properties(agent) + assert props["mllm"]["vendor"] == "gemini" + assert props["mllm"]["api_key"] == "gemini-key" + assert props["mllm"]["params"]["model"] == "gemini-live-2.5-flash" + + +def test_byok_vertex_ai_mllm_params() -> None: + agent = Agent(name="t").with_mllm( + VertexAI( + project_id="my-project", + location="us-central1", + adc_credentials_string="{}", + model="gemini-live-2.5-flash", + ) + ) + props = build_properties(agent) + assert 
props["mllm"]["vendor"] == "vertexai" + assert props["mllm"]["project_id"] == "my-project" + assert props["mllm"]["location"] == "us-central1" + assert props["mllm"]["adc_credentials_string"] == "{}" + assert props["mllm"]["params"]["model"] == "gemini-live-2.5-flash" + + +def test_byok_xai_grok_mllm_params() -> None: + agent = Agent(name="t").with_mllm(XaiGrok(api_key="xai-key")) + props = build_properties(agent) + assert props["mllm"]["vendor"] == "xai" + assert props["mllm"]["api_key"] == "xai-key" + + +# =========================================================================== +# Preset Coverage Matrix +# =========================================================================== + + +def test_preset_deepgram_nova_2_inferred() -> None: + tts = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice") + preset, properties = resolve_session_presets(None, {"asr": DeepgramSTT(model="nova-2", language="en").to_config(), "tts": tts.to_config()}) + assert preset is not None and "deepgram_nova_2" in preset + + +def test_preset_deepgram_nova_3_inferred() -> None: + tts = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice") + preset, properties = resolve_session_presets(None, {"asr": DeepgramSTT(model="nova-3", language="en").to_config(), "tts": tts.to_config()}) + assert preset is not None and "deepgram_nova_3" in preset + + +def test_preset_openai_gpt_4o_mini_inferred() -> None: + tts = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice") + preset, properties = resolve_session_presets(None, {"llm": OpenAI(model="gpt-4o-mini").to_config(), "tts": tts.to_config()}) + assert preset is not None and "openai_gpt_4o_mini" in preset + + +def test_preset_openai_tts_1_inferred() -> None: + preset, properties = resolve_session_presets(None, {"tts": OpenAITTS(voice="alloy").to_config()}) + assert preset == "openai_tts_1" + assert properties["tts"]["vendor"] == "openai" + + +def test_preset_minimax_speech_2_8_turbo_inferred() -> None: + preset, properties = 
resolve_session_presets(None, {"tts": MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice").to_config()}) + assert preset == "minimax_speech_2_8_turbo" + + +def test_preset_minimax_speech_2_6_turbo_inferred() -> None: + preset, properties = resolve_session_presets(None, {"tts": MiniMaxTTS(model="speech-2.6-turbo", voice_id="voice").to_config()}) + assert preset == "minimax_speech_2_6_turbo" + + +def test_explicit_minimax_preset_strips_internal_hint() -> None: + """Explicit MiniMax TTS preset must not leak _minimax_preset_model to the wire.""" + # When the caller supplies the preset explicitly, inference is skipped but the + # internal _minimax_preset_model hint set by MiniMaxTTS.to_config() must still + # be removed before the POST body is sent. + tts_config = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice").to_config() + assert "_minimax_preset_model" in tts_config # confirm the hint is set pre-strip + + _, properties = resolve_session_presets("minimax_speech_2_8_turbo", {"tts": tts_config}) + assert "_minimax_preset_model" not in properties["tts"] diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index c398e02..0ea3c7d 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -7,8 +7,10 @@ DeepgramSTT, ElevenLabsTTS, GoogleSTT, + MicrosoftSTT, OpenAI, OpenAISTT, + SarvamSTT, SpeechmaticsSTT, TurnDetectionConfig, ) @@ -39,20 +41,20 @@ def properties(agent: Agent) -> dict: ) -def test_bcp47_stt_language_sets_turn_detection_language_and_provider_param() -> None: - props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en-US"))) +def test_bcp47_stt_language_stays_in_asr_params_and_defaults_turn_detection() -> None: + props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) assert props["asr"]["vendor"] == "speechmatics" - assert "language" not in props["asr"] + assert props["asr"]["language"] == "en-US" assert 
props["turn_detection"]["language"] == "en-US" - assert props["asr"]["params"]["language"] == "en-US" + assert props["asr"]["params"]["language"] == "en" -def test_provider_language_defaults_turn_detection_language_when_not_supported_by_ares() -> None: +def test_provider_language_does_not_set_turn_detection_language() -> None: props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) assert props["asr"]["vendor"] == "speechmatics" - assert "language" not in props["asr"] + assert props["asr"]["language"] == "en-US" assert props["turn_detection"]["language"] == "en-US" assert props["asr"]["params"]["language"] == "en" @@ -66,24 +68,26 @@ def test_turn_detection_language_can_differ_from_provider_language() -> None: ) assert props["turn_detection"]["language"] == "fr-FR" - assert "language" not in props["asr"] + assert props["asr"]["language"] == "fr-FR" assert props["asr"]["params"]["language"] == "en" def test_invalid_turn_detection_language_is_rejected() -> None: - with pytest.raises(ValueError, match="Invalid interaction language: en"): - properties(Agent(turn_detection=TurnDetectionConfig(language="en"))) # type: ignore[arg-type] + with pytest.raises(ValueError, match="Invalid turn_detection.language: xx"): + properties(Agent(turn_detection=TurnDetectionConfig(language="xx"))) # type: ignore[arg-type] def test_default_turn_detection_language_is_sent_without_stt() -> None: props = properties(base_agent()) - assert props["asr"] == {"vendor": "ares"} + assert props["asr"] == {"vendor": "ares", "language": "en-US"} assert props["turn_detection"] == {"language": "en-US"} def test_stt_vendor_params_match_documented_shapes() -> None: - assert DeepgramSTT(model="nova-3", language="en-US").to_config()["params"] == { + deepgram_managed = DeepgramSTT(model="nova-3", language="en-US").to_config() + assert "language" not in deepgram_managed + assert deepgram_managed["params"] == { "model": "nova-3", "language": "en-US", } @@ -96,20 +100,33 @@ 
def test_stt_vendor_params_match_documented_shapes() -> None: "language": "en", } - assert OpenAISTT(api_key="openai-key", model="gpt-4o-mini-transcribe", language="en").to_config()["params"] == { + # api_key → wire key "key"; keyterm passes through unchanged + assert DeepgramSTT(api_key="dg-key", model="nova-3", language="en", keyterm="term").to_config()["params"] == { + "key": "dg-key", + "model": "nova-3", + "language": "en", + "keyterm": "term", + } + + assert OpenAISTT( + api_key="openai-key", + model="gpt-4o-mini-transcribe", + language="en", + prompt="Transcribe English speech", + ).to_config()["params"] == { "api_key": "openai-key", "input_audio_transcription": { "model": "gpt-4o-mini-transcribe", "language": "en", + "prompt": "Transcribe English speech", }, } - assert OpenAISTT(api_key="openai-key").to_config()["params"] == { - "api_key": "openai-key", - "input_audio_transcription": { - "model": "whisper-1", - }, - } + with pytest.raises(ValueError, match="prompt is required"): + OpenAISTT(api_key="openai-key", language="en").to_config() + + with pytest.raises(ValueError, match="language is required"): + OpenAISTT(api_key="openai-key", prompt="Transcribe speech").to_config() assert GoogleSTT( project_id="project", @@ -132,8 +149,46 @@ def test_stt_vendor_params_match_documented_shapes() -> None: "language_code": "en-US", } - assert AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws").to_config()["params"] == { + assemblyai_config = AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws").to_config() + assert "language" not in assemblyai_config + assert assemblyai_config["params"] == { "api_key": "assembly-key", "language": "en-US", "uri": "wss://example.test/ws", } + + assert MicrosoftSTT(key="ms-key", region="eastus", language="en-US").to_config()["params"] == { + "key": "ms-key", + "region": "eastus", + "language": "en-US", + } + + assert SpeechmaticsSTT(api_key="sm-key", 
language="en").to_config()["params"] == { + "api_key": "sm-key", + "language": "en", + } + + assert SarvamSTT(api_key="sarvam-key", language="en-IN").to_config()["params"] == { + "api_key": "sarvam-key", + "language": "en-IN", + } + + +def test_assemblyai_params_stay_nested_and_asr_language_comes_from_turn_detection() -> None: + props = properties( + Agent(turn_detection=TurnDetectionConfig(language="fr-FR")) + .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini", base_url="https://api.openai.com/v1/chat/completions")) + .with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5", base_url="wss://api.elevenlabs.io/v1")) + .with_stt(AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws")) + ) + + assert props["asr"] == { + "vendor": "assemblyai", + "language": "fr-FR", + "params": { + "api_key": "assembly-key", + "language": "en-US", + "uri": "wss://example.test/ws", + }, + } + assert props["turn_detection"] == {"language": "fr-FR"} diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index bdd9482..11e3f35 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -1,9 +1,18 @@ import pytest -from agora_agent import AmazonTTS, CartesiaTTS, DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS +from agora_agent import AmazonTTS, CartesiaTTS, DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MicrosoftTTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS +from agora_agent.agents.types.start_agents_request_properties import StartAgentsRequestProperties +from agora_agent.core.jsonable_encoder import jsonable_encoder +from agora_agent.core.pydantic_utilities import parse_obj_as def test_tts_vendor_params_match_generated_core_shapes() -> None: + assert MicrosoftTTS(key="ms-key", region="eastus", voice_name="en-US-JennyNeural").to_config()["params"] == { + "key": "ms-key", + "region": 
"eastus", + "voice_name": "en-US-JennyNeural", + } + assert AmazonTTS(access_key="access", secret_key="secret", region="us-east-1", voice_id="Joanna", engine="neural").to_config()["params"] == { "aws_access_key_id": "access", "aws_secret_access_key": "secret", @@ -116,3 +125,33 @@ def test_tts_managed_mode_validation_matches_core_shapes() -> None: with pytest.raises(Exception, match="MiniMaxTTS requires key unless using a supported Agora-managed model"): MiniMaxTTS(model="unsupported-model") + + +def test_tts_wire_serialization_applies_fern_aliases() -> None: + """Verify alias-sensitive TTS params keep the exact provider wire keys.""" + _BASE = dict(channel="ch", token="tok", agent_rtc_uid="1", remote_rtc_uids=["100"]) + + google_config = GoogleTTS( + key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000 + ).to_config() + assert "VoiceSelectionParams" in google_config["params"] + google_wire = jsonable_encoder(parse_obj_as(StartAgentsRequestProperties, {**_BASE, "tts": google_config})) + google_params = google_wire["tts"]["params"] + assert "VoiceSelectionParams" in google_params, f"wire missing VoiceSelectionParams, got: {list(google_params)}" + assert "voice_selection_params" not in google_params + assert "AudioConfig" in google_params + assert "audio_config" not in google_params + + rime_config = RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config() + assert "modelId" in rime_config["params"] + rime_wire = jsonable_encoder(parse_obj_as(StartAgentsRequestProperties, {**_BASE, "tts": rime_config})) + rime_params = rime_wire["tts"]["params"] + assert "modelId" in rime_params, f"wire missing modelId, got: {list(rime_params)}" + assert "model_id" not in rime_params + + murf_config = MurfTTS(key="murf-key", voice_id="Ariana").to_config() + assert "voiceId" in murf_config["params"] + murf_wire = jsonable_encoder(parse_obj_as(StartAgentsRequestProperties, {**_BASE, "tts": murf_config})) + murf_params = 
murf_wire["tts"]["params"] + assert "voiceId" in murf_params, f"wire missing voiceId, got: {list(murf_params)}" + assert murf_params["voiceId"] == "Ariana"