From b66d871314ca0e5929cb9c9095949a7fd5e856a7 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:30:41 +0000 Subject: [PATCH 1/2] [fern-generated] Update SDK Generated by Fern CLI Version: unknown Generators: - fernapi/fern-python-sdk: 4.37.0 --- scripts/check_release_workflow.py | 54 ------------------- ...gents_request_properties_turn_detection.py | 6 --- src/agora_agent/core/client_wrapper.py | 4 +- src/agora_agent/types/asr.py | 1 - src/agora_agent/types/deepgram_asr.py | 4 -- src/agora_agent/types/deepgram_asr_params.py | 2 +- 6 files changed, 3 insertions(+), 68 deletions(-) delete mode 100644 scripts/check_release_workflow.py diff --git a/scripts/check_release_workflow.py b/scripts/check_release_workflow.py deleted file mode 100644 index 1a6e065..0000000 --- a/scripts/check_release_workflow.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -import re -import sys -from pathlib import Path -from typing import NoReturn - - -def fail(message: str) -> NoReturn: - print(message, file=sys.stderr) - raise SystemExit(1) - - -def read_version(path: str) -> str: - text = Path(path).read_text() - match = re.search(r'^version\s*=\s*"v?([^"]+)"', text, re.M) - if not match: - fail(f"version not found in {path}") - return match.group(1) - - -def read_compat_dependency(path: str) -> str: - text = Path(path).read_text() - match = re.search(r'^agora-agents\s*=\s*"([^"]+)"', text, re.M) - if not match: - fail(f"agora-agents dependency not found in {path}") - return match.group(1) - - -root_version = read_version("pyproject.toml") -compat_pyproject = "compat/agora-agent-server-sdk/pyproject.toml" -compat_version = read_version(compat_pyproject) -compat_dependency = read_compat_dependency(compat_pyproject) - -if compat_version != root_version: - fail(f"Compat package version ({compat_version}) must match root package version ({root_version}).") - -expected_dependency = f">={root_version},<3.0.0" -if 
compat_dependency != expected_dependency: - fail(f"Compat package dependency on agora-agents ({compat_dependency}) must be {expected_dependency}.") - -release_workflow = Path(".github/workflows/release.yml").read_text() -required_workflow_markers = [ - ("contents: write", "release workflow must have contents: write so it can create GitHub releases"), - ("gh release create", "release workflow must create a GitHub release when one does not exist"), - ("gh release edit", "release workflow must update an existing GitHub release"), - ("release_notes.md", "release workflow must generate and use a release notes file"), -] - -for marker, message in required_workflow_markers: - if marker not in release_workflow: - fail(message) - -print("Release metadata and workflow checks passed.") diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py index fb58a36..40dbb02 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py @@ -5,7 +5,6 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel -from ...types.asr_language import AsrLanguage from .start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from .start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness from .start_agents_request_properties_turn_detection_interrupt_mode import ( @@ -19,11 +18,6 @@ class StartAgentsRequestPropertiesTurnDetection(UncheckedBaseModel): Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. 
""" - language: typing.Optional[AsrLanguage] = pydantic.Field(default=None) - """ - BCP-47 language tag identifying the primary language used for agent interaction. - """ - mode: typing.Optional[typing.Literal["default"]] = pydantic.Field(default=None) """ Conversation turn detection mode: diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index a8efe07..2df9814 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.1.1", + "User-Agent": "agora-agents/v2.1.2", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.1.1", + "X-Fern-SDK-Version": "v2.1.2", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py index f08086f..1f2225d 100644 --- a/src/agora_agent/types/asr.py +++ b/src/agora_agent/types/asr.py @@ -54,7 +54,6 @@ class Asr_Deepgram(UncheckedBaseModel): vendor: typing.Literal["deepgram"] = "deepgram" language: typing.Optional[AsrLanguage] = None params: DeepgramAsrParams - keyterm: typing.Optional[str] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py index 1c79c7b..723cd86 100644 --- a/src/agora_agent/types/deepgram_asr.py +++ b/src/agora_agent/types/deepgram_asr.py @@ -16,10 +16,6 @@ class DeepgramAsr(UncheckedBaseModel): language: typing.Optional[AsrLanguage] = None params: DeepgramAsrParams - keyterm: typing.Optional[str] = pydantic.Field(default=None) - """ - Boost specialized terms and brands for preset-backed Deepgram usage. 
- """ if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py index 259958e..6688333 100644 --- a/src/agora_agent/types/deepgram_asr_params.py +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -34,7 +34,7 @@ class DeepgramAsrParams(UncheckedBaseModel): keyterm: typing.Optional[str] = pydantic.Field(default=None) """ - Boost specialized terms and brands + Boost specialized terms and brands for Deepgram. """ if IS_PYDANTIC_V2: From 83e9b9c3fc79f1d7a578de641494a08e54a4468b Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:31:14 +0000 Subject: [PATCH 2/2] [fern-replay] Applied customizations Patches applied (14): - patch-7c2d9d99: feat(agentkit): align session options and token uid handling - patch-7465fada: fix(agentkit): resolve Python session typing issues - patch-fae1249a: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
- patch-49af6f65: Align AgentKit TTS provider options with docs - patch-bad47d96: Align AgentKit provider BYOK parameter requirements - patch-434c8af1: Align AgentKit LLM and ASR vendor validation - patch-968e1f03: Restrict managed OpenAI LLM models in AgentKit - patch-676b93b3: Align managed vendor validation with generated core shapes - patch-8d52340e: fix(agentkit): flatten Deepgram TTS passthrough params - patch-cb9ab8b8: docs(agentkit): align OpenAI TTS instructions support - patch-299e4bd9: fix(agentkit): resolve provider config type checks - patch-583eccc0: Move AgentKit language to turn detection - patch-bed29b6b: chore: bump Python packages to 2.1.0 - patch-776f7c4a: Fix vendor validation matrix for presets, pipeline_id, and deprecation path Patches with unresolved conflicts (28): - patch-6e30398b: chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases - patch-9df782b4: feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 - patch-26706d73: feat(agentkit): add GenericAvatar and session-aware avatar validation - patch-9f491c63: feat(agentkit): update Agent builder and session lifecycle for v2.7 - patch-64703bda: test(agentkit): add custom tests for v1.5.0 AgentKit behavior - patch-6c20f076: docs(agentkit): update v1.5.0 guides, reference, and changelog - patch-eaec58eb: refactor(agentkit): align deprecated vendor aliases with canonical names - patch-20245632: feat(agentkit): export type aliases and avatar token helpers - patch-972dd5bd: updated docs - patch-4323b470: rename python package to agora-agents - patch-d29165c4: make python compat package publishable - patch-fc9d93c3: Document agora-agents PyPI install name and migration notes - patch-44c21c14: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
- patch-87fc4488: Update docs to import from agora_agent package root - patch-923cf954: Prioritize app credentials and builder in Python docs Rewrite getting-started auth and quick-start for app credentials with the builder API. De-emphasize presets and align index, BYOK, and README with the recommended onboarding path. - patch-d475306b: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. - patch-c9355576: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. - patch-98ecb4d3: Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM vendor helpers. Introduce named LLM vendor classes with correct request serialization, export them from the package root, and add tests covering each provider's config shape. - patch-a5097b8d: Document new LLM vendors and tighten onboarding docs. Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM to vendor references, simplify README and index navigation, and align quick-start and terminology with Agora-managed model language. 
- patch-7d30c9dd: Add AgentKit ASR interaction language handling - patch-a95214eb: Document AgentKit ASR language and STT params - patch-eeac05d0: Move prompt and greeting docs to vendor config - patch-a94bac6d: Align AgentKit provider wrappers with regenerated core schemas - patch-198f367f: Update AgentKit TTS provider docs and examples - patch-96afe786: align v2.1 provider docs with AgentKit validation - patch-617ee134: feat(agentkit): support agent-level pipeline_id - patch-8e22e6d0: udpated agent docs - patch-b76a7006: Bump Python SDK version metadata and request headers to v2.1.1 Run `fern-replay resolve` to apply these customizations. Patches absorbed by generator (5): - patch-b7f0c36c: feat(agentkit): release v2.0.0 updates - patch-4d32368c: Add compat-build CI job and harden dual-package PyPI publish Build and verify the compat wheel re-exports, gate publish on compat-build, simplify version checks with poetry version, wait for primary package on PyPI, and retry compat publish on failure. - patch-20109390: Fix PyPI publish auth and explicitly protect release workflow in Fern ignore. Use PYPI_API_TOKEN for primary and compat Poetry publishes, matching the v1.4.1 release flow, and list release.yml explicitly in .fernignore. - patch-0297a70e: Update AgentKit v2.1 provider docs and examples - patch-c9022354: docs(agentkit): align TTS provider reference fields The generator now produces these customizations natively. 
--- .fern/replay.lock | 17991 +++++++++++++++- docs/reference/vendors.md | 1 + scripts/check_release_workflow.py | 54 + src/agora_agent/agentkit/agent.py | 19 + src/agora_agent/agentkit/agent_session.py | 1 + src/agora_agent/agentkit/vendors/avatar.py | 43 + src/agora_agent/agentkit/vendors/llm.py | 5 +- src/agora_agent/agentkit/vendors/mllm.py | 1 + src/agora_agent/agentkit/vendors/stt.py | 3 + src/agora_agent/agentkit/vendors/tts.py | 6 + ...gents_request_properties_turn_detection.py | 6 + tests/custom/test_agentkit_agent.py | 298 + tests/custom/test_agentkit_session.py | 383 + tests/custom/test_agentkit_vendors.py | 122 + 14 files changed, 18930 insertions(+), 3 deletions(-) create mode 100644 scripts/check_release_workflow.py create mode 100644 tests/custom/test_agentkit_agent.py create mode 100644 tests/custom/test_agentkit_session.py create mode 100644 tests/custom/test_agentkit_vendors.py diff --git a/.fern/replay.lock b/.fern/replay.lock index 536b6e4..a435ef4 100644 --- a/.fern/replay.lock +++ b/.fern/replay.lock @@ -6,5 +6,17992 @@ generations: timestamp: 2026-05-20T20:38:02.180Z cli_version: unknown generator_versions: {} -current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf -patches: [] + - commit_sha: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + tree_hash: db7756fbc0a5c6923371615dd752c8e17b2d828b + timestamp: 2026-06-04T20:30:41.901Z + cli_version: unknown + generator_versions: + fernapi/fern-python-sdk: 4.37.0 +current_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 +patches: + - id: patch-6e30398b + content_hash: sha256:e99898e508e2d6cb9f134cc33e0b73c1c8acb845f5887924e0e38031a6e089c0 + original_commit: 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 + original_message: "chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/vendors/__init__.py + patch_content: | + diff 
--git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 1942bce..5ceda66 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -1,13 +1,30 @@ + from .agent import ( + Agent, + + AgentConfig, + + AgentConfigUpdate, + + ConversationHistory, + + ConversationRole, + + ConversationSessionTurn, + + ConversationTurn, + + ConversationTurns, + StartAgentsRequestProperties, + + AvatarConfig, + + AvatarVendor, + GeofenceConfig, + + LlmConfig, + + LlmStyle, + + MllmConfig, + + MllmVendor, + RtcConfig, + + SttConfig, + + SttVendor, + + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + @@ -37,9 +54,14 @@ from .agent import ( + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + + SessionInfo, + + SessionListResponse, + + SessionSummary, + + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + @@ -57,8 +79,10 @@ from ..agent_management.types.agent_think_agent_management_request_on_speaking_a + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -112,6 +136,7 @@ from .vendors import ( + FishAudioTTS, + Gemini, + GeminiLive, + + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + @@ -132,14 +157,27 @@ from .vendors import ( + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + + XaiGrok, + + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + + "AgentConfig", + + "AgentConfigUpdate", + # Return 
type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + + "LlmConfig", + + "LlmStyle", + + "SttConfig", + + "SttVendor", + + "TtsConfig", + + "MllmConfig", + + "MllmVendor", + + "AvatarConfig", + + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + @@ -147,6 +185,7 @@ __all__ = [ + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + @@ -181,6 +220,7 @@ __all__ = [ + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + @@ -197,6 +237,15 @@ __all__ = [ + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + + "SessionInfo", + + "SessionListResponse", + + "SessionSummary", + + "ConversationHistory", + + "ConversationTurn", + + "ConversationRole", + + "ConversationTurns", + + "ConversationSessionTurn", + + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + @@ -253,14 +302,19 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + + "is_generic_avatar", + + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 0320843..689eab1 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -11,9 +11,9 @@ from .base import ( + OpenAISampleRate, + SampleRate, + ) + -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar + +from 
.avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + @@ -82,8 +82,11 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + ] + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ) + from .agent_session import AgentSession, 
AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + 
MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + XaiGrok, + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + 
"SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, 
AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + status: unresolved + - id: patch-9df782b4 + content_hash: sha256:84c08fe3239d2ecb0b0a3ddd33b0dce4e7b012125be797aa83ca12893363b565 + original_commit: 9df782b46d872599f103078e30c5ded2053f2517 + original_message: "feat(agentkit): update MLLM and LLM vendor wrappers for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/llm.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:57:54 -0400 + Subject: [PATCH] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 + + Adds xAI Grok 
Realtime and Vertex AI MLLM wrappers, and aligns MLLM + config serialization with the generated core types. LLM vendors now + accept typed greeting_configs and serialize them through the generated + model shape, including interruptable. + --- + src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- + src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ + 2 files changed, 113 insertions(+), 36 deletions(-) + + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 7465c9f..6f74b43 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,9 +1,14 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] + result.append(item) + return result + + + + +def _dump_optional_model(value: Any) -> Any: + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -74,7 +87,7 @@ class OpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: 
Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -150,7 +163,7 @@ class AzureOpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -216,7 +229,7 @@ class Anthropic(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + 
greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -287,7 +300,7 @@ class Gemini(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 5f6f940..cd6cd07 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -53,18 +52,97 @@ class OpenAIRealtime(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if 
self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + + + +class XaiGrokOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="xAI API key") + + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + + + +class XaiGrok(BaseMLLM): + + """xAI 
Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + + + def __init__(self, **kwargs: Any): + + self.options = XaiGrokOptions(**kwargs) + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = dict(self.options.params or {}) + + if self.options.voice is not None: + + params["voice"] = self.options.voice + + if self.options.language is not None: + + params["language"] = self.options.language + + if self.options.sample_rate is not None: + + params["sample_rate"] = self.options.sample_rate + + + + config: Dict[str, Any] = { + + "vendor": "xai", + + "api_key": self.options.api_key, + + "url": self.options.url, + + "params": params, + + } + + + + if self.options.greeting_message is not None: + + config["greeting_message"] = self.options.greeting_message + + if self.options.input_modalities is not None: + + config["input_modalities"] = self.options.input_modalities + + if self.options.output_modalities is not None: + + config["output_modalities"] = self.options.output_modalities + + if self.options.messages is not None: + + config["messages"] = self.options.messages + + if self.options.failure_message is not None: + + config["failure_message"] = self.options.failure_message + + if self.options.turn_detection is not None: + + config["turn_detection"] = self.options.turn_detection + + + + return config + + + + + +class XaiRealtimeOptions(XaiGrokOptions): + + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + + + def __init__(self, **data: Any): + + warnings.warn( + + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**data) + + + + + +class XaiRealtime(XaiGrok): + + """Deprecated: use :class:`XaiGrok` instead.""" + + + + def __init__(self, **kwargs: Any): + + warnings.warn( + + "XaiRealtime is deprecated; use XaiGrok instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**kwargs) + + + + + class VertexAIOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + - params: Dict[str, Any] = { + - "model": self.options.model, + - "project_id": self.options.project_id, + - "location": self.options.location, + - "adc_credentials_string": self.options.adc_credentials_string, + - } + - + + # additional_params spread first so that explicit fields always win, + + # matching the TypeScript SDK. 
+ + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + params["model"] = self.options.model + + params["project_id"] = self.options.project_id + + params["location"] = self.options.location + + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + - if self.options.additional_params is not None: + - params.update(self.options.additional_params) + + config: Dict[str, Any] = { + "vendor": "vertexai", + @@ -119,12 +193,8 @@ class VertexAI(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -179,12 
+247,8 @@ class GeminiLive(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(default="gpt-4o-mini", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + endpoint: str = Field(..., 
description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + max_tokens: Optional[int] = Field(default=None, gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + 
failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or "https://api.anthropic.com/v1/messages", + "api_key": self.options.api_key, + "params": params, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = 
self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + src/agora_agent/agentkit/vendors/mllm.py: 
| + import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, 
description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + + class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: 
str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: 
Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + - id: patch-26706d73 + content_hash: sha256:a9551e0b774b96e7734e9faa7d770611861cf443837428272ef75710447238da + original_commit: 26706d73ae15d860d57daf926837632c01be7f10 + original_message: "feat(agentkit): add GenericAvatar and 
session-aware avatar validation" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/vendors/avatar.py + patch_content: |+ + From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:59:22 -0400 + Subject: [PATCH] feat(agentkit): add GenericAvatar and session-aware avatar + validation + + Adds the GenericAvatar vendor wrapper and extends avatar validation + helpers for generic and RTC-backed avatars. Session-derived fields such + as agora_appid, agora_channel, and agora_token can now be validated + after AgentSession enrichment. + --- + src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- + src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ + 2 files changed, 76 insertions(+), 1 deletion(-) + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index 9e132a9..a04809c 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + return config.get("vendor") == "generic" + + + + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + ) + + + + + +def validate_avatar_config( + + config: typing.Dict[str, typing.Any], + + require_session_fields: bool = False, + +) -> None: + """Validates avatar configuration at runtime. 
+ + Parameters + @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + + if require_session_fields and not params.get("agora_token"): + + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + + elif is_generic_avatar(config): + + params = config.get("params", {}) + + if not params.get("api_key"): + + raise ValueError("Generic avatar requires api_key") + + if not params.get("api_base_url"): + + raise ValueError("Generic avatar requires api_base_url") + + if not params.get("avatar_id"): + + raise ValueError("Generic avatar requires avatar_id") + + if not params.get("agora_uid"): + + raise ValueError("Generic avatar requires agora_uid") + + if require_session_fields: + + if not params.get("agora_token"): + + raise ValueError("Generic avatar requires agora_token after session enrichment") + + if not params.get("agora_appid"): + + raise ValueError("Generic avatar requires agora_appid after session enrichment") + + if not params.get("agora_channel"): + + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index b83a356..00cad8f 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -132,6 +132,48 @@ class LiveAvatarAvatar(BaseAvatar): + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + +class GenericAvatarOptions(BaseModel): + + 
model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Generic avatar provider API key") + + api_base_url: str = Field(..., description="Avatar provider API base URL") + + avatar_id: str = Field(..., description="Avatar ID") + + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + +class GenericAvatar(BaseAvatar): + + def __init__(self, **kwargs: Any): + + self.options = GenericAvatarOptions(**kwargs) + + + + @property + + def required_sample_rate(self) -> int: + + return 0 + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = { + + "api_key": self.options.api_key, + + "api_base_url": self.options.api_base_url, + + "avatar_id": self.options.avatar_id, + + "agora_uid": self.options.agora_uid, + + } + + + + if self.options.agora_appid is not None: + + params["agora_appid"] = self.options.agora_appid + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + + if self.options.agora_channel is not None: + + params["agora_channel"] = self.options.agora_channel + + if self.options.additional_params is not None: + + params = {**self.options.additional_params, **params} + + + + enable = self.options.enable if self.options.enable is not None else True + + return {"enable": enable, "vendor": "generic", "params": params} + + + + + class AnamAvatarOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/avatar_types.py: | + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. + """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. 
" + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. 
+ """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + AKOOL_SAMPLE_RATE = 16000 + + + class HeyGenAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="HeyGen API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, 
description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific 
parameters") + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + pass + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = 
Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + status: unresolved + - id: patch-9f491c63 + content_hash: sha256:d9811b2c5927be74f2125444dcf36642b88ad7be422019688cb0228093dce1d0 + original_commit: 9f491c63a964c13c67ba4af3708379e1b75a92d8 + original_message: "feat(agentkit): update Agent builder and session lifecycle for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + patch_content: |+ + From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 21:00:58 -0400 + Subject: [PATCH] feat(agentkit): update Agent builder and session lifecycle + for v2.7 + + Aligns Agent and AgentSession with the generated v2.7 request shape. + MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars + are rejected when MLLM is configured. AgentSession now enriches generic + and RTC avatars with session context, auto-generates avatar tokens, + validates TTS sample rates from vendor-specific fields, and adds + paginated get_turns/get_all_turns helpers with fail-fast pagination + guards. 
+ --- + src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- + src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- + 2 files changed, 360 insertions(+), 35 deletions(-) + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 70a1bdd..86a958e 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -8,6 +8,24 @@ if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + +from ..agents.types.get_agents_response import GetAgentsResponse + +from ..agents.types.list_agents_response import ListAgentsResponse + +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + +from 
..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger import + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + +LlmConfig = StartAgentsRequestPropertiesLlm + +LlmStyle = StartAgentsRequestPropertiesLlmStyle + +SttConfig = StartAgentsRequestPropertiesAsr + +SttVendor = StartAgentsRequestPropertiesAsrVendor + +TtsConfig = Tts + +MllmConfig = StartAgentsRequestPropertiesMllm + +MllmVendor = 
StartAgentsRequestPropertiesMllmVendor + +AvatarConfig = StartAgentsRequestPropertiesAvatar + +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + @@ -93,6 +122,18 @@ InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + +AgentConfig = StartAgentsRequestProperties + +AgentConfigUpdate = UpdateAgentsRequestProperties + +SessionInfo = GetAgentsResponse + +SessionListResponse = ListAgentsResponse + +SessionSummary = ListAgentsResponseDataListItem + +ConversationHistory = GetHistoryAgentsResponse + +ConversationTurn = GetHistoryAgentsResponseContentsItem + +ConversationRole = GetHistoryAgentsResponseContentsItemRole + +ConversationTurns = GetTurnsAgentsResponse + +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + +SpeakPriority = SpeakAgentsRequestPriority + +Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + @@ -116,6 +157,7 @@ FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + @@ -183,9 +225,20 @@ class Agent: + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + + sample_rate = vendor.sample_rate + + if ( + + self._avatar_required_sample_rate not in (None, 0) + + and sample_rate is not None + 
+ and sample_rate != self._avatar_required_sample_rate + + ): + + raise ValueError( + + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + + f"but TTS is configured with {sample_rate} Hz. " + + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + - new_agent._tts_sample_rate = vendor.sample_rate + + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + @@ -194,6 +247,9 @@ class Agent: + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + + # Note: avatars are not supported with MLLM. The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` so callers can still + + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + @@ -202,7 +258,10 @@ class Agent: + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + + advanced_features_model = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_mllm": None}, + + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + @@ -214,6 +273,10 @@ class Agent: + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + + # Note: avatars are not supported with MLLM. 
The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + + # enabled) so callers may still combine the two for testing or for the + + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + @@ -282,7 +345,10 @@ class Agent: + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + + new_agent._advanced_features = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_tools": enabled}, + + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + @@ -294,6 +360,23 @@ class Agent: + new_agent._parameters = parameters + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + + new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. 
+ + @@ -342,6 +425,33 @@ class Agent: + new_agent._filler_words = filler_words + return new_agent + + + @staticmethod + + def _field_value(value: typing.Any, field: str) -> typing.Any: + + if value is None: + + return None + + if isinstance(value, dict): + + return value.get(field) + + return getattr(value, field, None) + + + + @staticmethod + + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + + if hasattr(value, "model_copy"): + + return value.model_copy(update=update) + + if hasattr(value, "copy"): + + return value.copy(update=update) + + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + + data_channel = self._field_value(self._parameters, "data_channel") + + if not enable_rtm or data_channel is not None: + + return self._parameters + + if self._parameters is None: + + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + + if isinstance(self._parameters, dict): + + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + + @property + def name(self) -> typing.Optional[str]: + return self._name + @@ -354,6 +464,10 @@ class Agent: + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + + @property + + def tts_sample_rate(self) -> typing.Optional[int]: + + return self._tts_sample_rate + + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + @@ -536,6 +650,20 @@ class Agent: + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + + # Validate the MLLM + enabled-avatar combination BEFORE generating the + + # RTC token so callers 
get a clear, actionable error first (matches the + + # TypeScript and Go SDKs' fail-fast contract). + + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + + avatar_enabled = ( + + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + + ) + + if is_mllm_mode and avatar_enabled: + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + @@ -553,9 +681,6 @@ class Agent: + **token_kwargs, + ) + + - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + - is_mllm_mode = bool(mllm_flag or self._mllm is not None) + - + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + @@ -579,11 +704,12 @@ class Agent: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + - if self._parameters is not None: + - if isinstance(self._parameters, dict): + - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + + parameters = self._resolved_parameters() + + if parameters is not None: + + if isinstance(parameters, dict): + + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + - base_kwargs["parameters"] = self._parameters + + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + @@ -596,12 +722,10 @@ class Agent: + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + - if self._greeting: + + if self._greeting is not None: + 
mllm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + - if self._max_history is not None: + - mllm_config.setdefault("max_history", self._max_history) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + @@ -617,14 +741,14 @@ class Agent: + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + - if self._instructions: + + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + - if self._greeting: + - llm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + - llm_config.setdefault("failure_message", self._failure_message) + + if self._greeting is not None: + + llm_config["greeting_message"] = self._greeting + + if self._failure_message is not None: + + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + - llm_config.setdefault("max_history", self._max_history) + + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index 2408659..e41a399 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_request_on_thinking_a + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent 
import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -182,17 +185,29 @@ class _AgentSessionBase: + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + + if self._is_mllm_mode(): + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + + sample_rate = self._agent.tts_sample_rate + + if sample_rate is None and isinstance(tts_params, dict): + + sample_rate = ( + + tts_params.get("sample_rate") + + or tts_params.get("sample_rate_hertz") + + or tts_params.get("samplingRate") + + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + @@ -211,6 +226,50 @@ class _AgentSessionBase: + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + + avatar = properties.get("avatar") + + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + + return + + + + params = avatar.get("params") + + if not isinstance(params, dict): + + params = {} + + avatar["params"] = params + + + + if is_generic_avatar(avatar): + + if not params.get("agora_appid"): + + params["agora_appid"] = self._app_id + + if not params.get("agora_channel"): + + params["agora_channel"] = self._channel + + + + if not is_rtc_avatar(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_token"): + + if not self._app_certificate: + + raise ValueError( + + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + + ) + + token_kwargs: typing.Dict[str, typing.Any] = {} + + if self._expires_in is not None: + + token_kwargs["token_expire"] = self._expires_in + + params["agora_token"] = generate_convo_ai_token( + + app_id=self._app_id, + + app_certificate=self._app_certificate, + + channel_name=self._channel, + + account=str(params["agora_uid"]), + + **token_kwargs, + + ) + + + + if str(params.get("agora_uid")) == self._agent_uid: + + self._warn( + + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ + ) + + + + validate_avatar_config(avatar, require_session_fields=True) + + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + @@ -238,12 +297,17 @@ class _AgentSessionBase: + **token_opts, + ) + properties = self._dump_model(base_properties) + + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + - mllm = dict(self._agent.mllm) + - if self._agent.greeting: + + mllm = self._dump_model(self._agent.mllm) + + if not isinstance(mllm, dict): + + mllm = {} + + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + + if self._agent.failure_message is not None: + + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + @@ -251,20 +315,41 @@ class _AgentSessionBase: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + - if self._agent.instructions: + + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + - if self._agent.greeting: + - llm.setdefault("greeting_message", self._agent.greeting) + - if self._agent.failure_message: + - llm.setdefault("failure_message", self._agent.failure_message) + + if self._agent.greeting is not None: + + llm["greeting_message"] = self._agent.greeting + + if self._agent.failure_message is not None: + + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + - llm.setdefault("max_history", self._agent.max_history) + + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + + @staticmethod + + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + + if pagination is None: + + return None + + if 
isinstance(pagination, dict): + + return pagination.get(field) + + return getattr(pagination, field, None) + + + + @staticmethod + + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + + return list(turns or []) + + + + @classmethod + + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + + data = cls._dump_model(first_response) + + if not isinstance(data, dict): + + data = {} + + data["turns"] = turns + + return GetTurnsAgentsResponse(**data) + + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + @@ -484,7 +569,12 @@ class AgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -547,15 +637,68 @@ class AgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - def get_turns(self) -> typing.Any: + + def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." + + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + @@ -734,7 +877,12 @@ class AsyncAgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -797,11 +945,64 @@ class AsyncAgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - async def get_turns(self) -> typing.Any: + + async def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = await self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = await self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from 
..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from 
..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, 
BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and 
.end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + 
silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and 
def with_mllm(self, vendor: BaseMLLM) -> "Agent":
    """Return a copy of this agent configured with the given MLLM vendor.

    The vendor's ``to_config()`` result is stored on the copy and, when it
    is a dict, marked with ``"enable": True``. Any ``enable_mllm`` entry in
    the copy's advanced features is dropped; if nothing else remains set
    there, advanced features are cleared altogether.

    Avatars are not rejected here. The MLLM + avatar combination is refused
    later, in ``to_properties`` / ``AgentSession.start``, so both can still
    be configured for tests, debugging, or disabled-avatar use cases.
    """
    clone = self._clone()
    mllm_config = vendor.to_config()
    clone._mllm = mllm_config
    if isinstance(mllm_config, dict):
        mllm_config["enable"] = True

    features = clone._advanced_features
    if isinstance(features, dict):
        # Dict form: rebuild the mapping without the enable_mllm key.
        remaining = dict(pair for pair in features.items() if pair[0] != "enable_mllm")
        clone._advanced_features = typing.cast(AdvancedFeatures, remaining) if remaining else None
        return clone

    if not isinstance(features, StartAgentsRequestPropertiesAdvancedFeatures):
        return clone

    # Model form: blank enable_mllm on a copy, then drop the whole object
    # when none of the other feature flags is set.
    stripped = self._copy_model_update(features, {"enable_mllm": None})
    nothing_left = (
        stripped.enable_rtm is None
        and stripped.enable_sal is None
        and stripped.enable_tools is None
    )
    clone._advanced_features = None if nothing_left else stripped
    return clone
def with_instructions(self, instructions: str) -> "Agent":
    """Return a copy of this agent that uses *instructions* instead of the current ones."""
    updated = self._clone()
    updated._instructions = instructions
    return updated
def with_tools(self, enabled: bool = True) -> "Agent":
    """Return a copy of this agent with the ``enable_tools`` advanced feature set to *enabled*.

    Existing advanced features are kept: a dict is copied with the key
    added or overwritten, a model is copied with the field updated, and
    when no advanced features exist a new model holding only
    ``enable_tools`` is created.
    """
    clone = self._clone()
    current = clone._advanced_features
    if isinstance(current, dict):
        merged = dict(current)
        merged["enable_tools"] = enabled
        clone._advanced_features = typing.cast(AdvancedFeatures, merged)
    elif current is not None:
        clone._advanced_features = self._copy_model_update(current, {"enable_tools": enabled})
    else:
        clone._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled)
    return clone
def with_labels(self, labels: typing.Dict[str, str]) -> "Agent":
    """Return a copy of this agent carrying the given custom labels.

    The labels are key-value pairs attached to the agent. They are copied
    into a new dict, so later changes to the caller's mapping do not affect
    the returned agent.
    """
    clone = self._clone()
    labels_copy = dict(labels)
    clone._labels = labels_copy
    return clone
@staticmethod
def _field_value(value: typing.Any, field: str) -> typing.Any:
    """Read *field* from a config that may be ``None``, a dict, or an object.

    Returns ``None`` when *value* is ``None`` or does not carry the field;
    dicts are read by key and anything else by attribute.
    """
    if value is not None:
        return value.get(field) if isinstance(value, dict) else getattr(value, field, None)
    return None
def create_session(
    self,
    client: typing.Any,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    name: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> "AgentSession":
    """Build a synchronous :class:`AgentSession` bound to this agent.

    The session name is the first truthy value among *name*, the agent's
    own name, and a generated ``agent-<unix seconds>`` fallback. ``app_id``
    and ``app_certificate`` are read from *client* when it has them, and
    default to ``""`` and ``None`` respectively. All other arguments are
    passed to the session unchanged.
    """
    # Imported here rather than at module level, as in the original code
    # (agent_session itself imports Agent from this module).
    from .agent_session import AgentSession

    session_name = name or self._name or f"agent-{int(time.time())}"
    session_kwargs: typing.Dict[str, typing.Any] = {
        "client": client,
        "agent": self,
        "app_id": getattr(client, "app_id", ""),
        "app_certificate": getattr(client, "app_certificate", None),
        "name": session_name,
        "channel": channel,
        "token": token,
        "agent_uid": agent_uid,
        "remote_uids": remote_uids,
        "idle_timeout": idle_timeout,
        "enable_string_uid": enable_string_uid,
        "preset": preset,
        "pipeline_id": pipeline_id,
        "expires_in": expires_in,
        "debug": debug,
        "warn": warn,
    }
    return AgentSession(**session_kwargs)
def to_properties(
    self,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    token: typing.Optional[str] = None,
    app_id: typing.Optional[str] = None,
    app_certificate: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    skip_vendor_validation: bool = False,
) -> StartAgentsRequestProperties:
    """Assemble the ``StartAgentsRequestProperties`` for starting this agent.

    Parameters
    ----------
    channel, agent_uid, remote_uids
        Sent as ``channel``, ``agent_rtc_uid`` and ``remote_rtc_uids``.
    idle_timeout, enable_string_uid
        Included in the request only when not ``None``.
    token
        Token to send as-is. When ``None`` a token is generated from
        *app_id* and *app_certificate* via ``generate_convo_ai_token``.
    app_id, app_certificate
        Required together when *token* is ``None``.
    expires_in
        Optional token lifetime; passed through ``_validate_expires_in``
        and forwarded as ``token_expire`` when a token is generated.
    skip_vendor_validation
        When true (and the agent is not in MLLM mode) the properties are
        returned without checking for, or attaching, LLM/TTS/ASR settings.

    Returns
    -------
    StartAgentsRequestProperties
        In MLLM mode the result carries ``mllm`` and no LLM/TTS/ASR
        sections. Otherwise it carries ``llm`` and ``tts``, plus ``asr``
        when STT is configured.

    Raises
    ------
    ValueError
        If MLLM is configured together with an enabled avatar, if neither
        a token nor app_id + app_certificate is available, or if TTS or LLM
        configuration is missing on the cascading path.
    """
    # Validate the MLLM + enabled-avatar combination BEFORE generating the
    # RTC token so callers get a clear, actionable error first (matches the
    # TypeScript and Go SDKs' fail-fast contract).
    mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True
    # Any configured MLLM counts as MLLM mode: mllm_flag can only be true
    # when self._mllm is a dict, so this is true exactly when _mllm is set.
    is_mllm_mode = bool(mllm_flag or self._mllm is not None)
    # An avatar dict is treated as enabled unless "enable" is explicitly False.
    avatar_enabled = (
        isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
    )
    if is_mllm_mode and avatar_enabled:
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    if token is None:
        if app_id is None or app_certificate is None:
            raise ValueError("Either token or app_id+app_certificate must be provided")
        # NOTE(review): _validate_expires_in is defined outside this block;
        # its exact checks are not visible here.
        validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None
        # Use generate_convo_ai_token (RTC + RTM) so the token works whether or
        # not the caller enables advanced_features.enable_rtm.
        token_kwargs: typing.Dict[str, typing.Any] = {}
        if validated_expires_in is not None:
            token_kwargs["token_expire"] = validated_expires_in
        token = generate_convo_ai_token(
            app_id=app_id,
            app_certificate=app_certificate,
            channel_name=channel,
            account=agent_uid,
            **token_kwargs,
        )

    # Fields that are always present in the request.
    base_kwargs: typing.Dict[str, typing.Any] = {
        "channel": channel,
        "token": token,
        "agent_rtc_uid": agent_uid,
        "remote_rtc_uids": remote_uids,
    }

    # Optional sections are added only when configured, so unset values are
    # not passed to the model constructor at all.
    if idle_timeout is not None:
        base_kwargs["idle_timeout"] = idle_timeout
    if enable_string_uid is not None:
        base_kwargs["enable_string_uid"] = enable_string_uid
    if self._mllm is not None:
        base_kwargs["mllm"] = self._mllm
    if self._turn_detection is not None:
        base_kwargs["turn_detection"] = self._turn_detection
    if self._interruption is not None:
        base_kwargs["interruption"] = self._interruption
    if self._sal is not None:
        base_kwargs["sal"] = self._sal
    if self._avatar is not None:
        base_kwargs["avatar"] = self._avatar
    if self._advanced_features is not None:
        base_kwargs["advanced_features"] = self._advanced_features
    # _resolved_parameters() may add data_channel="rtm" when RTM is enabled
    # and no data channel was chosen explicitly.
    parameters = self._resolved_parameters()
    if parameters is not None:
        if isinstance(parameters, dict):
            base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
        else:
            base_kwargs["parameters"] = parameters
    if self._geofence is not None:
        base_kwargs["geofence"] = self._geofence
    if self._labels is not None:
        base_kwargs["labels"] = self._labels
    if self._rtc is not None:
        base_kwargs["rtc"] = self._rtc
    if self._filler_words is not None:
        base_kwargs["filler_words"] = self._filler_words

    if is_mllm_mode:
        if self._mllm is not None:
            # Work on a copy so the agent's stored MLLM config is not mutated.
            mllm_config = dict(self._mllm)
            # setdefault: values already present in the vendor config win
            # over the agent-level greeting / failure message here.
            if self._greeting is not None:
                mllm_config.setdefault("greeting_message", self._greeting)
            if self._failure_message is not None:
                mllm_config.setdefault("failure_message", self._failure_message)
            base_kwargs["mllm"] = mllm_config
        return StartAgentsRequestProperties(**base_kwargs)

    if skip_vendor_validation:
        # The caller takes responsibility for the LLM/TTS/ASR sections.
        return StartAgentsRequestProperties(**base_kwargs)

    if self._tts is None:
        raise ValueError("TTS configuration is required. Use with_tts() to set it.")

    if self._llm is None:
        raise ValueError("LLM configuration is required. Use with_llm() to set it.")

    llm_config = dict(self._llm)
    # Agent-level fields take priority over the vendor's defaults.
    # This matches the TS SDK where agent-level values override vendor config.
    if self._instructions is not None:
        llm_config["system_messages"] = [{"role": "system", "content": self._instructions}]
    if self._greeting is not None:
        llm_config["greeting_message"] = self._greeting
    if self._failure_message is not None:
        llm_config["failure_message"] = self._failure_message
    if self._max_history is not None:
        llm_config["max_history"] = self._max_history

    base_kwargs["llm"] = llm_config
    base_kwargs["tts"] = self._tts
    # STT is optional; the request field is named "asr".
    if self._stt is not None:
        base_kwargs["asr"] = self._stt

    return StartAgentsRequestProperties(**base_kwargs)
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    # Declared with total=False, so every key below may be omitted.
    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # Either a single preset name or a sequence of names.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callable that receives warning messages as a single string.
    warn: typing.Callable[[str], None]
def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]:
    """Build the per-request ``Authorization`` header for app-credentials auth.

    Returns ``None`` unless the client's ``auth_mode`` is
    ``"app-credentials"``. In that mode a new ConvoAI token is generated on
    every call, for this session's channel and agent UID, and returned as
    ``{"Authorization": "agora token=<token>"}``. The app id and certificate
    come from the client when it has them, otherwise from the session.

    Raises
    ------
    RuntimeError
        If app-credentials mode is active but no app certificate is available.
    """
    client = self._client
    uses_app_credentials = getattr(client, "auth_mode", None) == "app-credentials"
    if not uses_app_credentials:
        return None

    resolved_app_id: str = getattr(client, "app_id", self._app_id)
    certificate: typing.Optional[str] = getattr(client, "app_certificate", self._app_certificate)
    if not certificate:
        raise RuntimeError("app_certificate is required for app-credentials auth mode")

    fresh_token = generate_convo_ai_token(
        app_id=resolved_app_id,
        app_certificate=certificate,
        channel_name=self._channel,
        account=self._agent_uid,
    )
    return {"Authorization": f"agora token={fresh_token}"}
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
@staticmethod
def _dump_model(value: typing.Any) -> typing.Any:
    """Convert models, and containers of models, into plain Python data.

    * Objects exposing ``model_dump`` (pydantic v2 style) are dumped with
      ``exclude_none=True``.
    * Dicts are rebuilt with ``None`` values dropped and the remaining
      values converted recursively.
    * Lists are converted element by element.
    * Objects that look like pydantic v1 models (they have ``__fields__``
      and a callable ``dict``) are dumped with ``dict(exclude_none=True)``.
      Without this fallback such a model would be returned undumped, and
      callers that expect a plain dict would fail on it.
    * Anything else is returned unchanged.
    """

    def dump(item: typing.Any) -> typing.Any:
        if hasattr(item, "model_dump"):
            return item.model_dump(exclude_none=True)
        if isinstance(item, dict):
            return {key: dump(entry) for key, entry in item.items() if entry is not None}
        if isinstance(item, list):
            return [dump(entry) for entry in item]
        # The __fields__ guard keeps unrelated objects that merely have a
        # ``dict`` attribute from being called with an unexpected keyword.
        if hasattr(item, "__fields__") and callable(getattr(item, "dict", None)):
            return item.dict(exclude_none=True)
        return item

    return dump(value)
@staticmethod
def _response_turns(response: typing.Any) -> typing.List[typing.Any]:
    """Return the ``turns`` of a response as a new list.

    The response may be a dict (read by key) or any object (read by
    attribute). A missing or empty ``turns`` value yields an empty list.
    """
    if isinstance(response, dict):
        found = response.get("turns")
    else:
        found = getattr(response, "turns", None)
    if not found:
        return []
    return list(found)
def _emit(self, event: str, data: typing.Any) -> None:
    """Invoke every handler registered for *event* with *data*.

    Handlers run in registration order. The handler list is snapshotted
    before dispatch, so a handler may call ``on()`` / ``off()`` (for
    example to unregister itself) without causing other handlers to be
    skipped. Handlers added during dispatch first run on the next emit.

    An exception raised by a handler is reported through
    ``warnings.warn`` and does not stop the remaining handlers or
    propagate to the caller.
    """
    registered = self._event_handlers.get(event)
    if not registered:
        return
    # Iterate over a copy: removing an entry from the live list while
    # looping over it would shift later handlers and skip one of them.
    for handler in tuple(registered):
        try:
            handler(data)
        except Exception as exc:
            # Prevent a misbehaving handler from blocking other handlers or
            # the session lifecycle. Warn so the error is not silently lost.
            warnings.warn(
                f"Event handler for '{event}' raised an exception: {exc}",
                stacklevel=2,
            )
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + status: unresolved + - id: patch-eaec58eb + content_hash: sha256:8390ced175326080fc76021a97d315e71229bbc9ad70eef35a63eb9968df7830 + original_commit: eaec58eb2edfe03b1311a32dd137a867edf5d096 + original_message: "refactor(agentkit): align deprecated vendor aliases with canonical names" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/__init__.py + - src/agora_agent/agentkit/vendors/avatar.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 689eab1..8e2042e 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -13,7 +13,7 @@ from .base import ( + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + @@ -83,7 +83,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index 00cad8f..50bdd08 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -5,19 +5,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + -HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + -class HeyGenAvatarOptions(BaseModel): + +class 
LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + - api_key: str = Field(..., description="HeyGen API key") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + @@ -31,20 +31,14 @@ class HeyGenAvatarOptions(BaseModel): + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + -class HeyGenAvatar(BaseAvatar): + - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - warnings.warn( + - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - self.options = HeyGenAvatarOptions(**kwargs) + + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return HEYGEN_SAMPLE_RATE + + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + @@ -65,71 +59,79 @@ class HeyGenAvatar(BaseAvatar): + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "heygen", "params": params} + + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + -class AkoolAvatarOptions(BaseModel): + - model_config = ConfigDict(extra="forbid") + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + - api_key: str = Field(..., description="Akool API key") + - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + -class AkoolAvatar(BaseAvatar): + +class HeyGenAvatar(BaseAvatar): + + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + + def __init__(self, **kwargs: Any): + - self.options = AkoolAvatarOptions(**kwargs) + + warnings.warn( + + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return AKOOL_SAMPLE_RATE + + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + + "quality": self.options.quality, + + "agora_uid": self.options.agora_uid, + } + + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + + if self.options.disable_idle_timeout is not None: + + params["disable_idle_timeout"] = self.options.disable_idle_timeout + + if self.options.activity_idle_timeout is not None: + + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "akool", "params": params} + + return {"enable": enable, "vendor": "heygen", "params": params} + + + -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + - pass + +class AkoolAvatarOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Akool API key") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + -class LiveAvatarAvatar(BaseAvatar): + +class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - 
self.options = LiveAvatarAvatarOptions(**kwargs) + + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return LIVEAVATAR_SAMPLE_RATE + + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + - "quality": self.options.quality, + - "agora_uid": self.options.agora_uid, + } + + - if self.options.agora_token is not None: + - params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + - if self.options.disable_idle_timeout is not None: + - params["disable_idle_timeout"] = self.options.disable_idle_timeout + - if self.options.activity_idle_timeout is not None: + - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "liveavatar", "params": params} + + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + - persona_id: Optional[str] = Field(default=None, description="Persona ID") + + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index cd6cd07..b58f040 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,4 +1,3 @@ + -import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -119,30 +118,6 @@ class XaiGrok(BaseMLLM): + return config + + + -class XaiRealtimeOptions(XaiGrokOptions): + - """Deprecated: use :class:`XaiGrokOptions` instead.""" + - + - def __init__(self, **data: Any): + - warnings.warn( + - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**data) + - + - + -class XaiRealtime(XaiGrok): + - """Deprecated: use :class:`XaiGrok` instead.""" + - + - def __init__(self, **kwargs: Any): + - warnings.warn( + - "XaiRealtime is deprecated; use XaiGrok instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**kwargs) + - + - + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + theirs_snapshot: + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + 
MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + LIVEAVATAR_SAMPLE_RATE = 24000 + HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + class LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] 
= Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = 
self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = 
{**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + src/agora_agent/agentkit/vendors/mllm.py: | + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: 
Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). 
Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class 
VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + 
- id: patch-20245632 + content_hash: sha256:a22e4a3b114ba8105c8129ccd6222570dc1f231daf9ac6037a00bcd4e11c425b + original_commit: 20245632afd066efe5a453665b29c5ba0e13e4f8 + original_message: "feat(agentkit): export type aliases and avatar token helpers" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/constants.py + patch_content: |+ + From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 15:17:27 -0400 + Subject: [PATCH] feat(agentkit): export type aliases and avatar token helpers + + --- + src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- + src/agora_agent/agentkit/agent.py | 22 +++++++++- + src/agora_agent/agentkit/agent_session.py | 8 +++- + src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- + src/agora_agent/agentkit/constants.py | 10 +++++ + 5 files changed, 90 insertions(+), 22 deletions(-) + + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 5ceda66..e9ab221 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -2,6 +2,7 @@ from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + @@ -62,23 +63,23 @@ from .agent import ( + SessionListResponse, + SessionSummary, + SpeakPriority, + + ThinkOnListeningAction, + + ThinkOnSpeakingAction, + + ThinkOnThinkingAction, + + ThinkResponse, + ) + -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + -from ..agent_management.types.agent_think_agent_management_response import ( + - AgentThinkAgentManagementResponse as AgentThinkResponse, + -) + -from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + +# Deprecated think type aliases (prefer ThinkOn* names). + +from .agent import ( + + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + + ThinkResponse as AgentThinkResponse, + ) + +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + @@ -94,6 +95,13 @@ from .constants import ( + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + + ThinkOnListeningActionIgnore, + + ThinkOnListeningActionInject, + + ThinkOnListeningActionInterrupt, + + ThinkOnSpeakingActionIgnore, + + ThinkOnSpeakingActionInterrupt, + + ThinkOnThinkingActionIgnore, + + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + @@ -158,7 +166,6 @@ from .vendors import ( + SpeechmaticsSTT, + VertexAI, + XaiGrok, + - XaiRealtime, + LiveAvatarAvatar, + ) + + @@ -172,6 +179,7 @@ __all__ = [ + "LlmConfig", + "LlmStyle", + "SttConfig", + + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + @@ -230,6 +238,13 @@ __all__ = [ + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + + "ThinkOnListeningActionInject", + + 
"ThinkOnListeningActionInterrupt", + + "ThinkOnListeningActionIgnore", + + "ThinkOnThinkingActionInterrupt", + + "ThinkOnThinkingActionIgnore", + + "ThinkOnSpeakingActionInterrupt", + + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + @@ -246,10 +261,16 @@ __all__ = [ + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + + "ThinkResponse", + + "ThinkOnListeningAction", + + "ThinkOnThinkingAction", + + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + + "is_avatar_token_managed", + + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + @@ -303,7 +324,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + @@ -314,7 +334,6 @@ __all__ = [ + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + - "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 86a958e..14933a2 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + -from .token import generate_convo_ai_token, _validate_expires_in + +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + + AgentThinkAgentManagementRequestOnListeningAction, + +) + 
+from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + + AgentThinkAgentManagementRequestOnThinkingAction, + +) + +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + + AgentThinkAgentManagementRequestOnSpeakingAction, + +) + +from ..agent_management.types.agent_think_agent_management_response import ( + + AgentThinkAgentManagementResponse, + +) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + +AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + @@ -159,6 +171,14 @@ FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + +# Think type aliases and response + +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + +ThinkResponse = AgentThinkAgentManagementResponse + + + +from .token import generate_convo_ai_token, _validate_expires_in + + + + class Agent: + """A reusable agent definition. 
+ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e41a399..269619e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -20,10 +20,10 @@ from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + - is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -242,7 +242,11 @@ class _AgentSessionBase: + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + - if not is_rtc_avatar(avatar): + + if not is_avatar_token_managed(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index a04809c..aea9da1 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -1,3 +1,4 @@ + +import warnings + import typing + + + @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + + """Return True when AgentKit manages the avatar RTC publisher identity.""" + + return ( + + is_heygen_avatar(config) + + or is_live_avatar_avatar(config) + + or is_generic_avatar(config) + + ) + + + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + - params = config.get("params", {}) + - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + """Deprecated: use :func:`is_avatar_token_managed` for 
vendor gating.""" + + warnings.warn( + + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + + "and keep agora_uid checks in session enrichment.", + + DeprecationWarning, + + stacklevel=2, + ) + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + @@ -95,7 +110,7 @@ def validate_tts_sample_rate( + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - - HeyGen: ONLY supports 24,000 Hz + + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py + index f86e4d3..c0a852e 100644 + --- a/src/agora_agent/agentkit/constants.py + +++ b/src/agora_agent/agentkit/constants.py + @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + + + +# Think action value constants (match Fern wire values) + +ThinkOnListeningActionInject = "inject" + +ThinkOnListeningActionInterrupt = "interrupt" + +ThinkOnListeningActionIgnore = "ignore" + +ThinkOnThinkingActionInterrupt = "interrupt" + +ThinkOnThinkingActionIgnore = "ignore" + +ThinkOnSpeakingActionInterrupt = "interrupt" + +ThinkOnSpeakingActionIgnore = "ignore" + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + 
FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, + ) + # Deprecated think type aliases (prefer ThinkOn* names). 
+ from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + 
VertexAI, + XaiGrok, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + 
"AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, 
AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from 
..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import 
StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from 
..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from 
..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + 
class SessionParamsInput(typing_extensions.TypedDict, total=False):
    """Plain-dict form of the session ``parameters`` payload.

    Declared with ``total=False``, so every key is optional and callers may
    supply any subset. ``Agent`` accepts this shape interchangeably with the
    ``SessionParams`` model for its ``parameters`` argument.
    """

    # Silence handling settings (generated parameters sub-type).
    silence_config: StartAgentsRequestPropertiesParametersSilenceConfig
    # Farewell settings (generated parameters sub-type).
    farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig
    # Data channel selection; ``Agent._resolved_parameters`` fills in "rtm"
    # when RTM is enabled and this key is absent.
    data_channel: StartAgentsRequestPropertiesParametersDataChannel
    enable_metrics: bool
    enable_error_message: bool
    # ``ParametersAudioScenario`` is the module-level alias of
    # ``StartAgentsRequestPropertiesParametersAudioScenario``.
    audio_scenario: ParametersAudioScenario
+ + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features 
def with_tts(self, vendor: BaseTTS) -> "Agent":
    """Return a copy of this agent configured with the given TTS vendor.

    Parameters
    ----------
    vendor:
        TTS vendor wrapper. Its ``sample_rate`` attribute is read and the
        result of its ``to_config()`` is stored as the TTS configuration.

    Returns
    -------
    Agent
        A clone carrying the new TTS settings; this method only assigns to
        the clone, never to ``self``.

    Raises
    ------
    ValueError
        If an avatar with a non-zero required sample rate is already attached
        and the vendor reports a different (non-``None``) sample rate.
    """
    tts_rate = vendor.sample_rate
    avatar_rate = self._avatar_required_sample_rate

    # A required rate of None or 0 means the avatar imposes no constraint.
    avatar_is_strict = avatar_rate not in (None, 0)
    if avatar_is_strict and tts_rate is not None and tts_rate != avatar_rate:
        raise ValueError(
            f"Avatar requires TTS sample rate of {avatar_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {avatar_rate}."
        )

    configured = self._clone()
    configured._tts = vendor.to_config()
    # Remembered so a later with_avatar() call can run the mirror check.
    configured._tts_sample_rate = tts_rate
    return configured
def with_avatar(self, vendor: BaseAvatar) -> "Agent":
    """Return a copy of this agent with the given avatar vendor attached.

    Avatars are not supported together with MLLM, but that combination is
    deliberately not rejected here: ``to_properties`` performs the check
    (only when the avatar is enabled), so callers may still combine the two
    for testing or for the disabled-avatar pattern.

    Parameters
    ----------
    vendor:
        Avatar vendor wrapper. Its ``required_sample_rate`` attribute is read
        and the result of its ``to_config()`` is stored as the avatar config.

    Returns
    -------
    Agent
        A clone carrying the avatar settings.

    Raises
    ------
    ValueError
        If the vendor requires a non-zero sample rate and a TTS with a
        different sample rate has already been configured.
    """
    needed_rate = vendor.required_sample_rate

    # None or 0 means the avatar accepts any TTS sample rate.
    if needed_rate not in (None, 0):
        current_rate = self._tts_sample_rate
        if current_rate is not None and current_rate != needed_rate:
            raise ValueError(
                f"Avatar requires TTS sample rate of {needed_rate} Hz, "
                f"but TTS is configured with {current_rate} Hz. "
                f"Please update your TTS sample_rate to {needed_rate}."
            )

    with_avatar_agent = self._clone()
    with_avatar_agent._avatar = vendor.to_config()
    # Remembered so a later with_tts() call can run the mirror check.
    with_avatar_agent._avatar_required_sample_rate = needed_rate
    return with_avatar_agent
def with_tools(self, enabled: bool = True) -> "Agent":
    """Return a copy of this agent with MCP tool invocation switched on or off.

    The flag is written to ``enable_tools`` inside the clone's advanced
    features, whichever form those currently take:

    * unset: a fresh ``StartAgentsRequestPropertiesAdvancedFeatures`` is created;
    * ``dict``: a new dict is built with ``enable_tools`` overridden;
    * model instance: an updated copy is made via ``_copy_model_update``.

    Parameters
    ----------
    enabled:
        Value to store under ``enable_tools`` (defaults to ``True``).

    Returns
    -------
    Agent
        A clone with the updated advanced features.
    """
    clone = self._clone()
    current = clone._advanced_features

    if current is None:
        updated = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled)
    elif isinstance(current, dict):
        # Build a new dict so the mapping held by the source agent is not mutated.
        merged = dict(current)
        merged["enable_tools"] = enabled
        updated = typing.cast(AdvancedFeatures, merged)
    else:
        updated = self._copy_model_update(current, {"enable_tools": enabled})

    clone._advanced_features = updated
    return clone
def with_labels(self, labels: typing.Dict[str, str]) -> "Agent":
    """Return a copy of this agent carrying the given custom labels.

    Labels are key-value pairs attached to the agent and returned in
    notification callbacks.

    Parameters
    ----------
    labels:
        Mapping of label names to values. A new ``dict`` is built from it, so
        the stored labels are a separate object from the caller's mapping.

    Returns
    -------
    Agent
        A clone holding the copied labels.
    """
    labelled = self._clone()
    label_snapshot = dict(labels)
    labelled._labels = label_snapshot
    return labelled
def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]:
    """Return the session parameters, defaulting ``data_channel`` to ``"rtm"`` when RTM is on.

    The stored parameters are returned unchanged unless both of these hold:
    the advanced features have ``enable_rtm`` set to exactly ``True``, and the
    parameters carry no ``data_channel`` value. In that case a value with
    ``data_channel="rtm"`` is produced without modifying ``self._parameters``:
    a new model when no parameters exist, a merged dict for dict input, or an
    updated model copy otherwise.
    """
    params = self._parameters
    rtm_requested = self._field_value(self._advanced_features, "enable_rtm") is True
    explicit_channel = self._field_value(params, "data_channel")

    needs_rtm_default = rtm_requested and explicit_channel is None
    if not needs_rtm_default:
        return params

    channel_update = {"data_channel": "rtm"}
    if params is None:
        return StartAgentsRequestPropertiesParameters(**channel_update)
    if isinstance(params, dict):
        return typing.cast(SessionParamsInput, {**params, **channel_update})
    return self._copy_model_update(params, channel_update)
def create_session(
    self,
    client: typing.Any,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    name: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> "AgentSession":
    """Create a synchronous ``AgentSession`` bound to this agent.

    The session name is the first truthy value among ``name``, the agent's
    own name, and a generated ``agent-<unix seconds>`` fallback. ``app_id``
    and ``app_certificate`` are read from ``client`` when it exposes them,
    defaulting to ``""`` and ``None`` respectively. All remaining arguments
    are forwarded to ``AgentSession`` unchanged.
    """
    # Imported lazily: at module level this import is only done under TYPE_CHECKING.
    from .agent_session import AgentSession

    if name:
        resolved_name = name
    elif self._name:
        resolved_name = self._name
    else:
        resolved_name = f"agent-{int(time.time())}"

    session_kwargs: typing.Dict[str, typing.Any] = {
        "client": client,
        "agent": self,
        "app_id": getattr(client, "app_id", ""),
        "app_certificate": getattr(client, "app_certificate", None),
        "name": resolved_name,
        "channel": channel,
        "token": token,
        "agent_uid": agent_uid,
        "remote_uids": remote_uids,
        "idle_timeout": idle_timeout,
        "enable_string_uid": enable_string_uid,
        "preset": preset,
        "pipeline_id": pipeline_id,
        "expires_in": expires_in,
        "debug": debug,
        "warn": warn,
    }
    return AgentSession(**session_kwargs)
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
def _clone(self) -> "Agent":
    """Return a new ``Agent`` holding the same configuration as this one.

    The copy is allocated with ``Agent.__new__`` so ``__init__`` does not run.
    Each configuration attribute is then copied by reference, so the clone
    shares any mutable values with the original (a shallow copy).
    """
    duplicate = Agent.__new__(Agent)
    copied_attributes = (
        "_name",
        "_llm",
        "_tts",
        "_stt",
        "_mllm",
        "_tts_sample_rate",
        "_avatar",
        "_avatar_required_sample_rate",
        "_turn_detection",
        "_interruption",
        "_sal",
        "_advanced_features",
        "_parameters",
        "_instructions",
        "_greeting",
        "_failure_message",
        "_max_history",
        "_geofence",
        "_labels",
        "_rtc",
        "_filler_words",
    )
    for attribute in copied_attributes:
        setattr(duplicate, attribute, getattr(self, attribute))
    return duplicate
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn

    The per-field notes below describe how the session classes in this module
    use the constructor argument of the same name.
    """

    # Used together with app_id to generate a token when `token` is not given.
    app_certificate: str
    # Pre-built token; when set, the session does not generate one.
    token: str
    # Forwarded to Agent.to_properties when building the start request.
    idle_timeout: int
    # Forwarded to Agent.to_properties when building the start request.
    enable_string_uid: bool
    # One preset name or a sequence of them; resolved by resolve_session_presets.
    preset: typing.Union[str, typing.Sequence[str]]
    # Passed through to the agents.start call.
    pipeline_id: str
    # Forwarded as the token lifetime when the session generates a token.
    expires_in: int
    # When truthy, start() prints the outgoing request.
    debug: bool
    # Receives warning messages; the session falls back to warnings.warn.
    warn: typing.Callable[[str], None]
def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]:
    """Build the per-request ``Authorization`` header for app-credentials mode.

    When the client's ``auth_mode`` is ``"app-credentials"``, a fresh ConvoAI
    token is generated on every call and returned as
    ``{"Authorization": "agora token=<token>"}``. For any other mode (or a
    client without ``auth_mode``) the result is ``None``, leaving the
    client-level header in effect.

    Raises
    ------
    RuntimeError
        If app-credentials mode is active but no app certificate is available
        from the client or the session.
    """
    client = self._client
    uses_app_credentials = getattr(client, "auth_mode", None) == "app-credentials"
    if not uses_app_credentials:
        return None

    # Client-level credentials win; the session's own values are the fallback.
    resolved_app_id: str = getattr(client, "app_id", self._app_id)
    resolved_certificate: typing.Optional[str] = getattr(
        client, "app_certificate", self._app_certificate
    )
    if not resolved_certificate:
        raise RuntimeError("app_certificate is required for app-credentials auth mode")

    fresh_token = generate_convo_ai_token(
        app_id=resolved_app_id,
        app_certificate=resolved_certificate,
        channel_name=self._channel,
        account=self._agent_uid,
    )
    return {"Authorization": f"agora token={fresh_token}"}
def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None:
    """Fill session-derived fields into ``properties["avatar"]`` in place.

    Nothing happens when the avatar entry is not a dict or has
    ``enable`` set to ``False``. Otherwise:

    * a missing or non-dict ``params`` is replaced with an empty dict;
    * for generic avatars, ``agora_appid`` and ``agora_channel`` default to the
      session's app ID and channel when not already set;
    * for token-managed avatars that carry an ``agora_uid`` but no
      ``agora_token``, a token is generated for that UID;
    * a warning is issued when the avatar UID equals the agent UID.

    The avatar config is validated before returning on every enabled path.

    Raises
    ------
    ValueError
        If a token must be generated but the session has no app certificate.
    """
    avatar_cfg = properties.get("avatar")
    if not isinstance(avatar_cfg, dict):
        return
    if avatar_cfg.get("enable", True) is False:
        return

    vendor_params = avatar_cfg.get("params")
    if not isinstance(vendor_params, dict):
        vendor_params = avatar_cfg["params"] = {}

    if is_generic_avatar(avatar_cfg):
        if not vendor_params.get("agora_appid"):
            vendor_params["agora_appid"] = self._app_id
        if not vendor_params.get("agora_channel"):
            vendor_params["agora_channel"] = self._channel

    # Without token management, or without a UID to mint a token for, there is
    # nothing further to enrich: validate what we have and stop.
    if not is_avatar_token_managed(avatar_cfg) or not vendor_params.get("agora_uid"):
        validate_avatar_config(avatar_cfg, require_session_fields=is_generic_avatar(avatar_cfg))
        return

    if not vendor_params.get("agora_token"):
        if not self._app_certificate:
            raise ValueError(
                "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. "
                "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor."
            )
        expiry_kwargs: typing.Dict[str, typing.Any] = (
            {} if self._expires_in is None else {"token_expire": self._expires_in}
        )
        vendor_params["agora_token"] = generate_convo_ai_token(
            app_id=self._app_id,
            app_certificate=self._app_certificate,
            channel_name=self._channel,
            account=str(vendor_params["agora_uid"]),
            **expiry_kwargs,
        )

    if str(vendor_params.get("agora_uid")) == self._agent_uid:
        self._warn(
            "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher."
        )

    validate_avatar_config(avatar_cfg, require_session_fields=True)
@staticmethod
def _page_value(pagination: typing.Any, field: str) -> typing.Any:
    """Read ``field`` from pagination metadata.

    ``pagination`` may be ``None``, a dict, or an object with attributes.
    The result is ``None`` when the metadata itself is ``None`` or when the
    field is absent.
    """
    if pagination is None:
        value = None
    elif isinstance(pagination, dict):
        value = pagination.get(field)
    else:
        value = getattr(pagination, field, None)
    return value
def _emit(self, event: str, data: typing.Any) -> None:
    """Invoke every handler registered for ``event`` with ``data``.

    Handlers run in registration order. Dispatch walks a snapshot of the
    handler list taken when the event fires, so a handler that calls
    :meth:`off` (or :meth:`on`) for the same event while it is running cannot
    cause a sibling handler to be skipped. Handlers added during dispatch are
    first invoked on the next emit.

    An exception raised by a handler is reported through ``warnings.warn`` and
    neither stops the remaining handlers nor propagates to the caller.

    Parameters
    ----------
    event : str
        The event type whose handlers should be invoked.
    data : Any
        The single positional argument passed to each handler.
    """
    registered = self._event_handlers.get(event)
    if not registered:
        return

    # Copy before iterating: off() removes from the live list, which would
    # shift later handlers under an in-progress iteration and skip one.
    for handler in list(registered):
        try:
            handler(data)
        except Exception as exc:
            # Prevent a misbehaving handler from blocking other handlers or
            # the session lifecycle. Warn so the error is not silently lost.
            warnings.warn(
                f"Event handler for '{event}' raised an exception: {exc}",
                stacklevel=2,
            )
def stop(self) -> None:
    """Stop the running agent session.

    A 404 from the server means the agent is already gone (for example it
    crashed or timed out); that case is treated as a successful stop instead
    of an error. On success the status becomes ``stopped`` and a ``stopped``
    event is emitted; on any other failure the status becomes ``error``, an
    ``error`` event is emitted, and the exception is re-raised.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot stop session in {state} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    self._status = "stopping"
    stopped_payload = {"agent_id": agent_id}

    try:
        self._client.agents.stop(
            self._app_id, agent_id, request_options=self._request_options()
        )
        self._status = "stopped"
        self._emit("stopped", stopped_payload)
    except Exception as exc:
        already_gone = isinstance(exc, ApiError) and exc.status_code == 404
        if already_gone:
            self._status = "stopped"
            self._emit("stopped", stopped_payload)
            return
        self._status = "error"
        self._emit("error", exc)
        raise
def interrupt(self) -> None:
    """Ask the server to interrupt the agent while it is speaking or thinking.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot interrupt in {state} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    agents_api = self._client.agents
    agents_api.interrupt(self._app_id, agent_id, request_options=self._request_options())
def update(self, properties: typing.Any) -> None:
    """Send a runtime configuration update for the running agent.

    Parameters
    ----------
    properties : UpdateAgentsRequestProperties
        Partial configuration to update; passed to the client unchanged.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot update in {state} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    agents_api = self._client.agents
    agents_api.update(
        self._app_id,
        agent_id,
        properties=properties,
        request_options=self._request_options(),
    )
def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse:
    """Get all turn analytics pages for this session.

    Pages are requested one at a time through :meth:`get_turns`, starting at
    page 1, and their turns are concatenated in the order received.

    Paging continues only while the latest pagination metadata reports
    ``is_last_page`` as exactly ``False``. If that flag is absent (``None``)
    on the first response, only the first page is returned.

    Parameters
    ----------
    page_size : int, optional
        Forwarded unchanged to every :meth:`get_turns` call.

    Returns
    -------
    GetTurnsAgentsResponse
        Built from the most recently fetched response, with its ``turns``
        replaced by the accumulated list from all pages.

    Raises
    ------
    RuntimeError
        If the server's pagination metadata is missing the fields required
        to advance, or if requesting the next page returns a page index that
        did not advance.
    """
    response = self.get_turns(page_index=1, page_size=page_size)
    all_turns = self._response_turns(response)
    # The response may be a plain dict or an object exposing `.pagination`.
    pagination = response.get("pagination") if isinstance(response, dict) else response.pagination
    # Fall back to page 1 when the server omits page_index (or reports a falsy value).
    current_page = self._page_value(pagination, "page_index") or 1
    while pagination is not None and self._page_value(pagination, "is_last_page") is False:
        total_pages = self._page_value(pagination, "total_pages")
        returned_index = self._page_value(pagination, "page_index")
        # With neither a page index nor a page count there is no safe stop condition.
        if returned_index is None and total_pages is None:
            raise RuntimeError(
                "get_all_turns pagination cannot continue: response must include "
                "page_index, total_pages, or is_last_page=true."
            )
        # Stop on the page count even if is_last_page still says False.
        if total_pages is not None and current_page >= total_pages:
            break
        next_page = current_page + 1
        response = self.get_turns(page_index=next_page, page_size=page_size)
        all_turns.extend(self._response_turns(response))
        pagination = response.get("pagination") if isinstance(response, dict) else response.pagination
        # Truthiness check: missing or empty pagination is treated as "no index".
        returned_index = self._page_value(pagination, "page_index") if pagination else None
        if returned_index is not None:
            # A non-advancing index is tolerated only on a page flagged as the
            # last one; otherwise the loop would refetch the same page forever.
            if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True:
                raise RuntimeError(
                    f"get_all_turns pagination did not advance: requested page {next_page}, "
                    f"received page {returned_index}."
                )
            current_page = returned_index
        else:
            # No index reported: trust the page we asked for, but only if the
            # response still gives some way to detect the end.
            total_pages = self._page_value(pagination, "total_pages") if pagination else None
            is_last_page = self._page_value(pagination, "is_last_page") if pagination else None
            if total_pages is None and is_last_page is not True:
                raise RuntimeError(
                    "get_all_turns pagination cannot continue: response must include "
                    "page_index, total_pages, or is_last_page=true."
                )
            current_page = next_page
    return self._with_all_turns(response, all_turns)
async def stop(self) -> None:
    """Stop the running agent session.

    A 404 from the server means the agent is already gone (for example it
    crashed or timed out); that case is treated as a successful stop instead
    of an error. On success the status becomes ``stopped`` and a ``stopped``
    event is emitted; on any other failure the status becomes ``error``, an
    ``error`` event is emitted, and the exception is re-raised.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot stop session in {state} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    self._status = "stopping"
    stopped_payload = {"agent_id": agent_id}

    try:
        await self._client.agents.stop(
            self._app_id, agent_id, request_options=self._request_options()
        )
        self._status = "stopped"
        self._emit("stopped", stopped_payload)
    except Exception as exc:
        already_gone = isinstance(exc, ApiError) and exc.status_code == 404
        if already_gone:
            self._status = "stopped"
            self._emit("stopped", stopped_payload)
            return
        self._status = "error"
        self._emit("error", exc)
        raise
async def interrupt(self) -> None:
    """Ask the server to interrupt the agent while it is speaking or thinking.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot interrupt in {state} state")
    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    agents_api = self._client.agents
    await agents_api.interrupt(
        self._app_id, agent_id, request_options=self._request_options()
    )
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + src/agora_agent/agentkit/avatar_types.py: | + import warnings + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. 
+ """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + 
avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. + """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/constants.py: | + """ + Type-safe constants for agent configuration values. + Use these instead of raw strings to avoid typos and get IDE autocomplete. 
+ """ + + # Data channel: "rtm" | "datastream" + class DataChannel: + RTM = "rtm" + DATASTREAM = "datastream" + + class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + + + # Silence action when timeout elapses: "speak" | "think" + # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) + class SilenceActionValues: + SPEAK = "speak" + THINK = "think" + + + # SAL mode: "locking" | "recognition" + # (Use for sal.sal_mode — avoids shadowing SalMode type) + class SalModeValues: + LOCKING = "locking" + RECOGNITION = "recognition" + + + # Geofence area: "GLOBAL" | "NORTH_AMERICA" | "EUROPE" | "ASIA" | "INDIA" | "JAPAN" + class GeofenceArea: + GLOBAL = "GLOBAL" + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Geofence exclude area (when area is GLOBAL) + class GeofenceExcludeArea: + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Filler word selection rule: "shuffle" | "round_robin" + class FillerWordsSelectionRule: + SHUFFLE = "shuffle" + ROUND_ROBIN = "round_robin" + + + # Turn detection type (deprecated; use TurnDetectionNestedConfig.EndOfSpeech instead) + class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + # Think action value constants (match Fern wire values) + ThinkOnListeningActionInject = "inject" + ThinkOnListeningActionInterrupt = "interrupt" + ThinkOnListeningActionIgnore = "ignore" + ThinkOnThinkingActionInterrupt = "interrupt" + ThinkOnThinkingActionIgnore = "ignore" + ThinkOnSpeakingActionInterrupt = "interrupt" + ThinkOnSpeakingActionIgnore = "ignore" + status: unresolved + - id: patch-972dd5bd + content_hash: sha256:10f86db20e0b5a3800efce4913b736ff338dee29eb18cb31e89658e0293b848e + original_commit: 972dd5bdafc09b3981ab2ce4e0d02beae165c626 + original_message: updated docs + original_author: 
digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 16:13:35 -0400 + Subject: [PATCH] updated docs + + --- + docs/reference/agent.md | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 1e88b8b..3163f9c 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -264,3 +264,18 @@ to_properties( + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + + +## Type aliases + + + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + + + +## Cross-SDK discovery map + + + +| Concept | Python | TypeScript | Go | + +|---|---|---|---| + +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | + | `failure_message` | `Optional[str]` | `None` | Spoken on error | + | `max_history` | `Optional[int]` | `None` | Max conversation history length | + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | 
`filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent.agentkit.vendors import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent.agentkit.vendors import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent.agentkit.vendors import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent.agentkit.vendors import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. 
+ + + ```python + from agora_agent.agentkit.vendors import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Override the system prompt. + + ### `with_greeting(greeting: str) -> Agent` + + Override the greeting message. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. 
+ + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Set the message spoken via TTS when the LLM call fails. + + ### `with_max_history(max_history: int) -> Agent` + + Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | System prompt | + | `greeting` | `Optional[str]` | Greeting message | + | `failure_message` | `Optional[str]` | Message spoken when LLM fails | + | `max_history` | `Optional[int]` | Max conversation history length | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ + ## Cross-SDK discovery map + + | Concept | Python | TypeScript | Go | + |---|---|---|---| + | STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + | xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + | Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + | Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + status: unresolved + - id: patch-7465fada + content_hash: sha256:9c6ed2e5f48702293eed8b213cc31cce63a7ed5a1ad16a0b23e791c13e77746f + original_commit: 7465fadafa0f1e62051d99b42d0eeda85f31eeee + original_message: "fix(agentkit): resolve Python session typing issues" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - src/agora_agent/agentkit/agent_session.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index dbff562..dca9ee8 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action 
import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import ( + get_preset_category, + infer_asr_preset, + infer_llm_preset, + infer_tts_preset, + normalize_preset_input, + resolve_session_presets, + ) + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. 
+ + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. 
+ + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + 
LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation_categories: typing.AbstractSet[str], + allow_missing_vendor_categories: typing.AbstractSet[str], + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation_categories=skip_vendor_validation_categories, + allow_missing_vendor_categories=allow_missing_vendor_categories, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": 
self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + def _vendor_validation_categories( + self, + pipeline_id: typing.Optional[str], + ) -> typing.Tuple[typing.Set[str], typing.Set[str]]: + skip_categories: typing.Set[str] = set() + allow_missing_categories: typing.Set[str] = {"asr", "llm", "tts"} if pipeline_id else set() + + preset = normalize_preset_input(self._preset) + if preset: + for item in preset.split(","): + category = get_preset_category(item) + if category is not None: + skip_categories.add(category) + allow_missing_categories.add(category) + + if infer_asr_preset(self._agent.stt): + skip_categories.add("asr") + if infer_llm_preset(self._agent.llm): + skip_categories.add("llm") + if infer_tts_preset(self._agent.tts): + skip_categories.add("tts") + return skip_categories, allow_missing_categories + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + 
@classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. + """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + skip_categories, allow_missing_categories = self._vendor_validation_categories(pipeline_id) + properties = self._build_start_properties( + token_opts, + skip_vendor_validation_categories=skip_categories, + allow_missing_vendor_categories=allow_missing_categories, + ) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + 
"pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. 
+ priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + skip_categories, allow_missing_categories = self._vendor_validation_categories(pipeline_id) + properties = self._build_start_properties( + token_opts, + skip_vendor_validation_categories=skip_categories, + allow_missing_vendor_categories=allow_missing_categories, + ) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": 
resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. 
+ priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + - id: patch-d29165c4 + content_hash: sha256:be59d1d3efc435d5e0b83305b2cd39ce3dad4534a4125de18028c137e692e659 + original_commit: d29165c4ddd8296af703a4e9ed848516f563dd1b + original_message: make python compat package publishable + original_author: chenyuguo + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/pyproject.toml + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 + From: chenyuguo + Date: Wed, 27 May 2026 17:24:50 +0800 + Subject: [PATCH] make python compat package publishable + + --- + compat/agora-agent-server-sdk/README.md | 2 ++ + compat/agora-agent-server-sdk/pyproject.toml | 3 +++ + .../src/agora_agent_server_sdk_compat/__init__.py | 1 + + 3 files changed, 6 insertions(+) + create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index 1388836..cff3cfe 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -9,3 +9,5 @@ pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index 8efbe53..ac93128 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -26,6 +26,9 @@ classifiers = [ + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + +packages = [ + + { include = "agora_agent_server_sdk_compat", from = "src"} + +] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + new file mode 100644 + index 0000000..55522c6 + --- /dev/null + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -0,0 +1 @@ + +"""Compatibility package for the renamed agora-agents distribution.""" + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.0.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.0.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility package for the renamed agora-agents distribution.""" + status: unresolved + - id: patch-fae1249a + content_hash: sha256:01bf21f3cc4c784dfcff80a48c9c7bb3123af4327a567b7c990b528e9780e9a2 + original_commit: fae1249a20c53761a2eb5515a1bf92ca666760d1 + original_message: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
+ original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 27 May 2026 16:58:18 -0400 + Subject: [PATCH] Re-export agora-agents API from legacy PyPI compatibility + package The compat distribution delegates to agora_agent via __getattr__ and + documents both import paths in its README. + + --- + compat/agora-agent-server-sdk/README.md | 7 +++++-- + .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- + 2 files changed, 18 insertions(+), 3 deletions(-) + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index cff3cfe..e43d1d8 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -8,6 +8,9 @@ New projects should install: + pip install agora-agents + ``` + + -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ +```python + +from agora_agent import Agora, Area + +from agora_agent_server_sdk_compat import Agora, Area + +``` + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + index 55522c6..6283244 100644 + --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -1 +1,13 @@ + -"""Compatibility package for the renamed agora-agents distribution.""" + +"""Compatibility re-exports for the renamed agora-agents package.""" + + + +import agora_agent as _agora_agent + + + +__all__ = getattr(_agora_agent, "__all__", []) + + + + + +def __getattr__(name: str): + + return getattr(_agora_agent, name) + + + + + +def __dir__(): + + return dir(_agora_agent) + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. 
The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility re-exports for the renamed agora-agents package.""" + + import agora_agent as _agora_agent + + __all__ = getattr(_agora_agent, "__all__", []) + + + def __getattr__(name: str): + return getattr(_agora_agent, name) + + + def __dir__(): + return dir(_agora_agent) + user_owned: true + - id: patch-fc9d93c3 + content_hash: sha256:93877741bdad745fda5dd549d7c3dd6bc315f4574aabd2defb52c0c795bff011 + original_commit: fc9d93c3026a6109d8a5e8b386418592f8d121c5 + original_message: Document agora-agents PyPI install name and migration notes + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/installation.md + patch_content: | + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index c14bdb2..f6f1750 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. + ## Install with pip + + ```sh + -pip install agora-agent-sdk + +pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + -poetry add agora-agent-sdk + +poetry add agora-agents + ``` + + ## Dependencies + theirs_snapshot: + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. 
+ --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Dependencies + + The following packages are installed automatically: + + | Package | Purpose | + |---|---| + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Sync vs. Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. + status: unresolved + - id: patch-44c21c14 + content_hash: sha256:920a8a5905a3bbb134edb28b007c5c0b1b4b2c1f75753140fef305b14a64e3e0 + original_commit: 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b + original_message: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_root_exports.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index a820291..f84862c 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -231,8 +231,7 @@ class Agent: + + Examples + -------- + - >>> from agora_agent.agentkit import Agent + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index fb8e548..a749d1e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -412,12 +412,10 @@ class AgentSession(_AgentSessionBase): + + Examples + -------- + - >>> from agora_agent import Agora, Area + - >>> from agora_agent.agentkit import Agent + + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + @@ -735,12 +733,10 @@ class AsyncAgentSession(_AgentSessionBase): + + Examples + -------- + - >>> from agora_agent import AsyncAgora, Area + - >>> from agora_agent.agentkit import Agent + + >>> from 
agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py + new file mode 100644 + index 0000000..9b2f508 + --- /dev/null + +++ b/tests/custom/test_root_exports.py + @@ -0,0 +1,29 @@ + +import pytest + + + +import agora_agent + +import agora_agent.agentkit as agentkit + + + + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + + + +def test_root_exports_fern_client_symbols() -> None: + + assert agora_agent.Agora is not None + + assert agora_agent.Area is not None + + assert agora_agent.AsyncAgora is not None + + + + + +def test_unknown_root_export_raises_attribute_error() -> None: + + with pytest.raises(AttributeError): + + _ = agora_agent.NotARealExportName + + + + + +def test_dir_includes_agentkit_vendor_exports() -> None: + + assert "DeepgramSTT" in dir(agora_agent) + + + + + +def test_all_includes_agentkit_vendor_exports() -> None: + + assert "DeepgramSTT" in agora_agent.__all__ + + assert "OpenAI" in agora_agent.__all__ + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from 
..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection 
import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from 
..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from 
..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from 
..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + 
EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = 
GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = 
StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = 
vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config 
+ return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. + """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. 
+ """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. + """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. 
+ """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. + """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> 
typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": 
self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, 
+ enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). 
+ mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = 
self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. 
+ if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as 
AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. 
+ + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # 
------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." 
+ ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." + ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + 
properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: 
typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. + """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + 
request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_root_exports.py: | + import pytest + + import agora_agent + import agora_agent.agentkit as agentkit + + + def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + + def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + + def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + + def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ + status: unresolved + - id: patch-d475306b + content_hash: sha256:407af5e7564d6e8d0b91f1e117cb433aec931f083225af53c6df2abfff281b22 + original_commit: d475306bd42279984bcf4934b900003e8e02c4eb + original_message: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - docs/getting-started/installation.md + patch_content: | + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index e43d1d8..1da36aa 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index 04b48da..8fca9ab 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + -See [Authentication](./authentication.md) for setup details. + +## Next steps + + + +- [Authentication](./authentication.md) — configure your credentials + +- [Quick Start](./quick-start.md) — build your first conversational agent + + + +## Migrating from a previous package name + + + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). 
+ theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. + --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Imports + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI + ``` + + The package installs as `agora-agents` and imports as `agora_agent`. + + ## Sync vs. 
Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + ## Dependencies + + | Package | Purpose | + | ------------------------------ | ------------------------------------------------------ | + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Next steps + + - [Authentication](./authentication.md) — configure your credentials + - [Quick Start](./quick-start.md) — build your first conversational agent + + ## Migrating from a previous package name + + The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). + status: unresolved + - id: patch-c9355576 + content_hash: sha256:83b3b6148b21f2b4d53ee67321777522f5f4e871b61ea3b23f3a6b88ca052769 + original_commit: c93555763ffd63267a737b3e430217a890f203db + original_message: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/authentication.md + - docs/guides/low-level-api.md + patch_content: | + diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md + index 31dcc56..74c62cd 100644 + --- a/docs/getting-started/authentication.md + +++ b/docs/getting-started/authentication.md + @@ -46,41 +46,6 @@ session = agent.create_session( + print(client.auth_mode) # "app-credentials" + ``` + + -## Other auth modes + +## Legacy auth modes + + -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. + - + -### Token auth (`auth_token`) + - + -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - auth_token="your-rest-auth-token", + -) + - + -session = agent.create_session( + - client, + - channel="room-123", + - agent_uid="1", + - remote_uids=["100"], + - token="your-rtc-join-token", + -) + -``` + - + -### Basic Auth (`customer_id` + `customer_secret`) + - + -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - customer_id="your-customer-id", + - customer_secret="your-customer-secret", + -) + -``` + +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. 
+ diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md + index 6677b45..47397b7 100644 + --- a/docs/guides/low-level-api.md + +++ b/docs/guides/low-level-api.md + @@ -1,187 +1,55 @@ + --- + sidebar_position: 10 + title: Low-Level API + -description: Direct client.agents.start() usage without the builder pattern. + +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. + +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + -## Raw telephony and phone-number APIs + - + -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + - + -- `client.telephony` for call status and hangup operations + -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
+ + -## Cascading flow (ASR → LLM → TTS) + +## Client setup + + ```python + from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import Tts_Elevenlabs + + client = Agora( + area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + -client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + + app_id="your-app-id", + + app_certificate="your-app-certificate", + ) + ``` + + -## Async (low-level) + +## Raw telephony and phone-number APIs + + -```python + -import asyncio + -from agora_agent import Area, AsyncAgora + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import 
Tts_Elevenlabs + +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + -client = AsyncAgora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + +- `client.telephony` for call status and hangup operations + +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + + +```python + +calls = client.telephony.list( + + appid=client.app_id, + + type="sip", + ) + + -async def main() -> None: + - await client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + - ) + - + -asyncio.run(main()) + +for call in calls: + + print(call.id, call.state) + ``` + + -## MLLM flow (multimodal) + +## Direct agent APIs + + -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). + +`client.agents` exposes the generated REST surface for advanced integrations. 
Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + -```python + -from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesMllm, + - StartAgentsRequestPropertiesMllmVendor, + - StartAgentsRequestPropertiesTts, + - StartAgentsRequestPropertiesTtsVendor, + - StartAgentsRequestPropertiesLlm, + -) + +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + -client = Agora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + - + -client.agents.start( + - client.app_id, + - name="mllm_agent", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="your_token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - mllm=StartAgentsRequestPropertiesMllm( + - enable=True, + - url="wss://api.openai.com/v1/realtime", + - api_key="", + - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + - params={ + - "model": "gpt-4o-realtime-preview", + - "voice": "alloy", + - }, + - input_modalities=["audio"], + - output_modalities=["text", "audio"], + - greeting_message="Hello! I'm ready to chat in real-time.", + - turn_detection={ + - "mode": "server_vad", + - "server_vad_config": { + - "idle_timeout_ms": 5000, + - }, + - }, + - ), + - ), + +```python + +info = session.raw.get( + + appid=session.app_id, + + agent_id=session.id, + ) + ``` + + -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). + +You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ theirs_snapshot: + docs/getting-started/authentication.md: | + --- + sidebar_position: 2 + title: Authentication + description: Configure the Python SDK with app credentials and understand other supported auth modes. + --- + + # Authentication + + Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. + + ## App credentials + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + + agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) + ) + + session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + ) + ``` + + ## Why app credentials + + - Fresh short-lived tokens per API call instead of reusing long-lived credentials + - No Customer ID / Customer Secret in request headers + - No manual REST or RTC token provisioning in application code + + ## Inspecting auth mode + + ```python + print(client.auth_mode) # "app-credentials" + ``` + + ## Legacy auth modes + + The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. + docs/guides/low-level-api.md: | + --- + sidebar_position: 10 + title: Low-Level API + description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + Use the `Agent` builder and `AgentSession` for conversational agent starts. 
That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. + + ## Client setup + + ```python + from agora_agent import Agora, Area + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + ``` + + ## Raw telephony and phone-number APIs + + AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + - `client.telephony` for call status and hangup operations + - `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + ```python + calls = client.telephony.list( + appid=client.app_id, + type="sip", + ) + + for call in calls: + print(call.id, call.state) + ``` + + ## Direct agent APIs + + `client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + ```python + info = session.raw.get( + appid=session.app_id, + agent_id=session.id, + ) + ``` + + You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ status: unresolved + - id: patch-299e4bd9 + content_hash: sha256:e1470176436d28416d0ff67d8acc614060fae7b312f86c09b899a92d1c4adfe4 + original_commit: 299e4bd9cb59bd6144084332a7c3fa7bf260769f + original_message: "fix(agentkit): resolve provider config type checks" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/vendors/llm.py + - src/agora_agent/agentkit/vendors/mllm.py + - src/agora_agent/agentkit/vendors/stt.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 6275f04..ecf01c6 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -57,6 +57,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + @@ -536,6 +538,23 @@ class Agent: + ) + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + 
+ new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 9156a01..5dd822d 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,7 +1,10 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field, model_validator + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + LlmGreetingConfigs = Dict[str, Any] + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 236a494..6a260d8 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py + index e5117b0..bb222a9 100644 + --- a/src/agora_agent/agentkit/vendors/stt.py + +++ b/src/agora_agent/agentkit/vendors/stt.py + @@ -89,6 +89,7 @@ class SpeechmaticsSTTOptions(BaseModel): + + api_key: str = Field(..., description="Speechmatics API key") + language: str = Field(..., description="Language code (e.g., en, es, fr)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora 
interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + @@ -124,6 +125,7 @@ class DeepgramSTTOptions(BaseModel): + api_key: Optional[str] = Field(default=None, description="Deepgram API key") + model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") + punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + @@ -353,6 +355,7 @@ class SarvamSTTOptions(BaseModel): + + api_key: str = Field(..., description="Sarvam API key") + language: str = Field(..., description="Language code (e.g., en, hi, ta)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + import warnings + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import 
StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from 
..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from 
..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule 
+ from ..types.tts import Tts + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + from ..types.mllm import Mllm + from ..types.mllm_turn_detection import MllmTurnDetection + from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode + from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = Llm + LlmStyle = GeneratedLlmStyle + SttConfig = Asr + AsrConfig = SttConfig + SttVendor = typing.Any + TtsConfig = Tts + MllmConfig = Mllm + MllmVendor = GeneratedMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = 
StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = MllmTurnDetection + MllmTurnDetectionMode = GeneratedMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: 
AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = typing.Dict[str, typing.Any] + LlmGreetingConfigsMode = typing.Any + McpServersItem = typing.Dict[str, typing.Any] + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + TurnDetectionLanguage = typing_extensions.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", 
+ "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" + TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + def _is_turn_detection_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _TURN_DETECTION_LANGUAGES + + + def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: + if not _is_turn_detection_language(value): + raise ValueError(f"Invalid interaction language: {value}") + return value # type: ignore[return-value] + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Deprecated: + The Agent-level ``instructions``, ``greeting``, ``failure_message``, + ``max_history``, and ``greeting_configs`` convenience fields are kept + for compatibility. Configure those values on the LLM or MLLM vendor + instead. 
+ + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + 
def with_tts(self, vendor: BaseTTS) -> "Agent":
    """Return a copy of this agent configured with the given TTS vendor.

    The copy stores ``vendor.to_config()`` and remembers the vendor's sample
    rate so that a later avatar can be checked against it.

    Raises:
        ValueError: if an avatar already attached to this agent requires a
            specific (non-zero) sample rate and the vendor reports a
            different one. A vendor sample rate of ``None`` is not checked.
    """
    tts_rate = vendor.sample_rate
    avatar_rate = self._avatar_required_sample_rate

    # An avatar rate of None or 0 means "no constraint".
    avatar_pins_rate = avatar_rate not in (None, 0)
    if avatar_pins_rate and tts_rate is not None and tts_rate != avatar_rate:
        raise ValueError(
            f"Avatar requires TTS sample rate of {avatar_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {avatar_rate}."
        )

    configured = self._clone()
    configured._tts = vendor.to_config()
    configured._tts_sample_rate = tts_rate
    return configured
def with_avatar(self, vendor: BaseAvatar) -> "Agent":
    """Return a copy of this agent configured with the given avatar vendor.

    The copy stores ``vendor.to_config()`` and the avatar's required sample
    rate, which ``with_tts`` later checks against.

    Combining an avatar with MLLM is deliberately NOT rejected here:
    ``to_properties`` raises for that combination (and only while the avatar
    is enabled), so the two can still be combined for testing or for the
    disabled-avatar pattern.

    Raises:
        ValueError: if the avatar requires a specific (non-zero) sample rate
            and the TTS already configured on this agent uses a different one.
    """
    avatar_rate = vendor.required_sample_rate
    tts_rate = self._tts_sample_rate

    # A required rate of None or 0 means the avatar imposes no constraint.
    avatar_pins_rate = avatar_rate not in (None, 0)
    if avatar_pins_rate and tts_rate is not None and tts_rate != avatar_rate:
        raise ValueError(
            f"Avatar requires TTS sample rate of {avatar_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {avatar_rate}."
        )

    configured = self._clone()
    configured._avatar = vendor.to_config()
    configured._avatar_required_sample_rate = avatar_rate
    return configured
def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent":
    """Return a new Agent with the specified RTC audio scenario.

    This method was previously defined twice in a row with identical bodies;
    the second definition silently replaced the first. Only one definition
    is kept.

    The receiver is never modified. The value is merged into the clone's
    session parameters according to their current form:

    * no parameters yet: a new ``StartAgentsRequestPropertiesParameters``
      holding only ``audio_scenario`` is created;
    * a plain dict: a new dict with ``"audio_scenario"`` added or replaced;
    * anything else: treated as a model and copied via ``_copy_model_update``.

    Args:
        audio_scenario: The audio scenario to store in the session parameters.

    Returns:
        A clone of this agent carrying the updated parameters.
    """
    new_agent = self._clone()
    if new_agent._parameters is None:
        new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario)
    elif isinstance(new_agent._parameters, dict):
        # Build a fresh dict so the dict shared with the original agent is untouched.
        new_agent._parameters = typing.cast(
            SessionParamsInput,
            {**new_agent._parameters, "audio_scenario": audio_scenario},
        )
    else:
        new_agent._parameters = self._copy_model_update(
            new_agent._parameters,
            {"audio_scenario": audio_scenario},
        )
    return new_agent
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def pipeline_id(self) -> typing.Optional[str]: + """Published AI Studio pipeline ID used as this agent's base configuration.""" + return self._pipeline_id + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, 
typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": 
def create_session(
    self,
    client: typing.Any,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    name: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> "AgentSession":
    """Build an ``AgentSession`` bound to this agent and the given client.

    The session name is the first truthy value among ``name``, the agent's
    own name, and a generated ``agent-<unix seconds>`` string. ``app_id`` and
    ``app_certificate`` are read from ``client`` when it has those
    attributes, otherwise ``""`` and ``None`` are used. Every other argument
    is forwarded to ``AgentSession`` unchanged.
    """
    # Imported at call time, as in the original.
    # NOTE(review): presumably to avoid a circular import — confirm.
    from .agent_session import AgentSession

    session_name = name or self._name
    if not session_name:
        session_name = f"agent-{int(time.time())}"

    client_app_id = getattr(client, "app_id", "")
    client_app_certificate = getattr(client, "app_certificate", None)

    return AgentSession(
        client=client,
        agent=self,
        app_id=client_app_id,
        app_certificate=client_app_certificate,
        name=session_name,
        channel=channel,
        token=token,
        agent_uid=agent_uid,
        remote_uids=remote_uids,
        idle_timeout=idle_timeout,
        enable_string_uid=enable_string_uid,
        preset=preset,
        pipeline_id=pipeline_id,
        expires_in=expires_in,
        debug=debug,
        warn=warn,
    )
None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + skip_vendor_validation_categories: typing.Optional[typing.AbstractSet[str]] = None, + allow_missing_vendor_categories: typing.Optional[typing.AbstractSet[str]] = None, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). 
+ mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = 
self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + warnings.warn( + "skip_vendor_validation is deprecated and will be removed in a future release. 
" + "Use skip_vendor_validation_categories and allow_missing_vendor_categories instead.", + DeprecationWarning, + stacklevel=2, + ) + + skip_categories = set(skip_vendor_validation_categories or ()) + allow_missing_categories = set(allow_missing_vendor_categories or ()) + if skip_vendor_validation: + skip_categories.update({"asr", "llm", "tts"}) + allow_missing_categories.update({"asr", "llm", "tts"}) + + skip_asr_validation = skip_vendor_validation or "asr" in skip_categories + skip_llm_validation = skip_vendor_validation or "llm" in skip_categories + skip_tts_validation = skip_vendor_validation or "tts" in skip_categories + allow_missing_asr = "asr" in allow_missing_categories + allow_missing_llm = "llm" in allow_missing_categories + allow_missing_tts = "tts" in allow_missing_categories + + if not skip_asr_validation and (self._stt is not None or not allow_missing_asr): + base_kwargs["asr"] = self._resolve_asr_config() + base_kwargs["turn_detection"] = self._resolve_turn_detection_config() + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None and not (skip_tts_validation or allow_missing_tts): + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None and not (skip_llm_validation or allow_missing_llm): + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + if self._llm is not None and not skip_llm_validation: + base_kwargs["llm"] = self._resolve_llm_config() + if self._tts is not None and not skip_tts_validation: + base_kwargs["tts"] = self._tts + + return StartAgentsRequestProperties(**base_kwargs) + + def _resolve_llm_config(self) -> typing.Dict[str, typing.Any]: + llm_config = dict(self._llm or {}) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. 
+ if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + return llm_config + + def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + asr_config = dict(self._stt or {}) + asr_config.pop("language", None) + if not asr_config: + asr_config["vendor"] = "ares" + return asr_config + + def _resolve_turn_detection_config(self) -> TurnDetectionConfig: + existing_stt_language = self._stt.get("language") if self._stt is not None else None + existing_turn_detection_language = self._field_value(self._turn_detection, "language") + language = ( + existing_turn_detection_language + if existing_turn_detection_language is not None + else existing_stt_language + if _is_turn_detection_language(existing_stt_language) + else DEFAULT_TURN_DETECTION_LANGUAGE + ) + language = _validate_turn_detection_language(language) + if self._turn_detection is None: + return StartAgentsRequestPropertiesTurnDetection(language=language) + if isinstance(self._turn_detection, dict): + return typing.cast(TurnDetectionConfig, {**self._turn_detection, "language": language}) + return self._copy_model_update(self._turn_detection, {"language": language}) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._pipeline_id = self._pipeline_id + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = 
self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field, model_validator + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Dict[str, Any] + _OPENAI_MANAGED_MODELS = {"gpt-4o-mini", "gpt-4.1-mini", "gpt-5-nano", "gpt-5-mini"} + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(..., description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + @model_validator(mode="after") + def _validate_byok_params(self) -> "OpenAIOptions": + if not self.model: + raise ValueError("OpenAI requires model") + if self.api_key is not None and self.base_url is None: + raise ValueError("OpenAI requires base_url when api_key is set") + if self.api_key is None 
class OpenAI(BaseLLM):
    """OpenAI-style LLM vendor; options are validated by ``OpenAIOptions``."""

    def __init__(self, **kwargs: Any):
        self.options = OpenAIOptions(**kwargs)

    def to_config(self) -> Dict[str, Any]:
        """Build the LLM config dict from the validated options.

        ``params`` starts from the model name, is extended or overridden by
        the free-form ``params`` option (matching the TS SDK's
        ``{ model, ...params }``), and finally by the named ``max_tokens``,
        ``temperature`` and ``top_p`` options, which take precedence.
        Optional fields are emitted only when they are not ``None``.
        """
        opts = self.options

        request_params: Dict[str, Any] = {"model": opts.model}
        request_params.update(opts.params or {})
        # Named fields win over anything supplied through the generic params dict.
        for tuning_field in ("max_tokens", "temperature", "top_p"):
            tuning_value = getattr(opts, tuning_field)
            if tuning_value is not None:
                request_params[tuning_field] = tuning_value

        config: Dict[str, Any] = {
            "url": opts.base_url or "https://api.openai.com/v1/chat/completions",
            "params": request_params,
            "style": "openai",
            "input_modalities": opts.input_modalities or ["text"],
        }

        # Fields copied verbatim; the grouping keeps the original key order.
        for plain_field in (
            "api_key",
            "headers",
            "system_messages",
            "greeting_message",
            "failure_message",
            "output_modalities",
        ):
            plain_value = getattr(opts, plain_field)
            if plain_value is not None:
                config[plain_field] = plain_value

        if opts.greeting_configs is not None:
            config["greeting_configs"] = _dump_optional_model(opts.greeting_configs)

        for plain_field in ("template_variables", "vendor"):
            plain_value = getattr(opts, plain_field)
            if plain_value is not None:
                config[plain_field] = plain_value

        if opts.mcp_servers is not None:
            # Every MCP server entry must carry a transport.
            config["mcp_servers"] = _ensure_mcp_transport(opts.mcp_servers)
        if opts.max_history is not None:
            config["max_history"] = opts.max_history

        return config
**kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(..., description="Model name") + url: str = Field(..., description="Anthropic messages endpoint URL") + max_tokens: int = Field(..., gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Dict[str, str] = Field(..., description="Anthropic request headers") + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url, + "api_key": self.options.api_key, + "params": params, + "headers": self.options.headers, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: 
Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GroqOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Groq API key") + model: str = Field(..., description="Model name") + base_url: str = 
Field(..., description="Groq-compatible endpoint") + + + class Groq(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = GroqOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["url"] = self.options.base_url + return config + + + class CustomLLMOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Custom LLM API key") + base_url: str = Field(..., description="OpenAI-compatible chat completions endpoint") + + + class CustomLLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = CustomLLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["vendor"] = self.options.vendor or "custom" + return config + + + class VertexAILLMOptions(GeminiOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Vertex AI access token or API key") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location") + + + class VertexAILLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAILLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + options = _dump_optional_model(self.options) + options.pop("project_id", None) + options.pop("location", None) + config = Gemini(**options).to_config() + params = dict(config["params"]) + params["project_id"] = self.options.project_id + params["location"] = self.options.location + config["params"] = params + return config + + + class AmazonBedrockOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + access_key: str = Field(..., description="AWS access key ID") + secret_key: str = Field(..., description="AWS secret access key") + region: str = Field(..., description="AWS region") + model: str = Field(..., description="Amazon Bedrock model identifier") + 
max_tokens: Optional[int] = Field(default=None, gt=0) + url: Optional[str] = Field(default=None, description="Amazon Bedrock converse stream endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + + class AmazonBedrock(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AmazonBedrockOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or f"https://bedrock-runtime.{self.options.region}.amazonaws.com/model/{self.options.model}/converse-stream", + "access_key": self.options.access_key, + "secret_key": self.options.secret_key, + "region": self.options.region, + "model": self.options.model, + "params": params, + "style": "bedrock", + "input_modalities": self.options.input_modalities or ["text"], + 
} + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config + + + class DifyOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Dify API key") + url: str = Field(..., description="Dify workflow or chat endpoint") + model: str = Field(..., description="Dify model identifier") + user: Optional[str] = Field(default=None, description="Dify user identifier") + conversation_id: Optional[str] = Field(default=None, description="Dify conversation ID") + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = 
Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0) + + + class Dify(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = DifyOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.user is not None: + params["user"] = self.options.user + if self.options.conversation_id is not None: + params["conversation_id"] = self.options.conversation_id + + config: Dict[str, Any] = { + "url": self.options.url, + "api_key": self.options.api_key, + "params": params, + "style": "dify", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config + src/agora_agent/agentkit/vendors/mllm.py: | + import 
warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...types.mllm_turn_detection import MllmTurnDetection + from .base import BaseMLLM + + MllmTurnDetectionConfig = MllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + voice: Optional[str] = Field(default=None, description="Voice identifier") + instructions: Optional[str] = Field(default=None, description="System instructions") + input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="Audio transcription settings") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if ( + self.options.model is not None + or self.options.params is not None + or self.options.voice is not None + 
or self.options.instructions is not None + or self.options.input_audio_transcription is not None + ): + params: Dict[str, Any] = {} + if self.options.model is not None: + params["model"] = self.options.model + if self.options.params is not None: + params.update(self.options.params) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.input_audio_transcription is not None: + params["input_audio_transcription"] = self.options.input_audio_transcription + config["params"] = params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. 
+ + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if 
self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, 
description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options + + config: Dict[str, Any] = { + "vendor": "vertexai", + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + 
config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + 
class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + src/agora_agent/agentkit/vendors/stt.py: | + from typing import Any, Dict, Optional, Tuple + + from pydantic import BaseModel, ConfigDict, Field, 
model_validator + from typing_extensions import Literal + + from .base import BaseSTT + + TurnDetectionLanguage = Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + TURN_DETECTION_LANGUAGE_VALUES: Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} + + + def _turn_detection_language(language: Optional[str]) -> Optional[TurnDetectionLanguage]: + if language in _TURN_DETECTION_LANGUAGES: + return language # type: ignore[return-value] + return None + + + class SpeechmaticsSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Speechmatics API key") + language: str = Field(..., description="Language code (e.g., en, es, fr)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class SpeechmaticsSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = SpeechmaticsSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + 
params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "api_key": self.options.api_key, + "language": self.options.language, + }) + if self.options.model is not None: + params["model"] = self.options.model + if self.options.uri is not None: + params["uri"] = self.options.uri + + config: Dict[str, Any] = { + "vendor": "speechmatics", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class DeepgramSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="Deepgram API key") + model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") + punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + @model_validator(mode="after") + def _validate_managed_model(self) -> "DeepgramSTTOptions": + if self.api_key is None and (self.model is None or self.model.strip().lower() not in _DEEPGRAM_MANAGED_MODELS): + raise ValueError("DeepgramSTT requires api_key unless using a supported Agora-managed model") + return self + + class DeepgramSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = DeepgramSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + if self.options.api_key is not None: + params["key"] = self.options.api_key + if self.options.model is not None: + params["model"] = 
self.options.model + if self.options.language is not None: + params["language"] = self.options.language + if self.options.smart_format is not None: + params["smart_format"] = self.options.smart_format + if self.options.punctuation is not None: + params["punctuation"] = self.options.punctuation + config: Dict[str, Any] = { + "vendor": "deepgram", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class MicrosoftSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + key: str = Field(..., description="Azure subscription key") + region: str = Field(..., description="Azure region (e.g., eastus)") + language: str = Field(..., description="Language code (e.g., en-US)") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class MicrosoftSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = MicrosoftSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "key": self.options.key, + "region": self.options.region, + }) + if self.options.language is not None: + params["language"] = self.options.language + + config: Dict[str, Any] = { + "vendor": "microsoft", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class OpenAISTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model (default: whisper-1)") + language: Optional[str] = Field(default=None, description="Language code") + prompt: Optional[str] = Field(default=None, description="Prompt that guides OpenAI transcription") + 
input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="OpenAI transcription settings") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class OpenAISTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = OpenAISTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + + transcription = {"model": "whisper-1", **(self.options.input_audio_transcription or {})} + if self.options.model is not None: + transcription["model"] = self.options.model + if self.options.prompt is not None: + transcription["prompt"] = self.options.prompt + if self.options.language is not None: + transcription["language"] = self.options.language + params["input_audio_transcription"] = transcription + + config: Dict[str, Any] = { + "vendor": "openai", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class GoogleSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud region") + adc_credentials_string: str = Field(..., description="Google service account credentials JSON string") + language: str = Field(..., description="Language code (e.g., en-US)") + model: Optional[str] = Field(default=None, description="Recognition model") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class GoogleSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = GoogleSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "project_id": self.options.project_id, + "location": self.options.location, + 
"adc_credentials_string": self.options.adc_credentials_string, + }) + + if self.options.language is not None: + params["language"] = self.options.language + if self.options.model is not None: + params["model"] = self.options.model + + config: Dict[str, Any] = { + "vendor": "google", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AmazonSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + access_key: str = Field(..., description="AWS Access Key ID") + secret_key: str = Field(..., description="AWS Secret Access Key") + region: str = Field(..., description="AWS region (e.g., us-east-1)") + language: str = Field(..., description="Language code") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class AmazonSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AmazonSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "access_key_id": self.options.access_key, + "secret_access_key": self.options.secret_key, + "region": self.options.region, + }) + if self.options.language is not None: + params["language_code"] = self.options.language + + config: Dict[str, Any] = { + "vendor": "amazon", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AssemblyAISTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="AssemblyAI API key") + language: str = Field(..., description="Language code") + uri: Optional[str] = Field(default=None, description="AssemblyAI streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + 
+ class AssemblyAISTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AssemblyAISTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + if self.options.language is not None: + params["language"] = self.options.language + if self.options.uri is not None: + params["uri"] = self.options.uri + + config: Dict[str, Any] = { + "vendor": "assemblyai", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AresSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + language: Optional[TurnDetectionLanguage] = Field(default=None, description="Language code") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class AresSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AresSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = {"vendor": "ares"} + if self.options.language is not None: + config["language"] = self.options.language + if self.options.additional_params: + config["params"] = self.options.additional_params + return config + + + class SarvamSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Sarvam API key") + language: str = Field(..., description="Language code (e.g., en, hi, ta)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class SarvamSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = SarvamSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] 
= dict(self.options.additional_params or {}) + params.update({ + "api_key": self.options.api_key, + "language": self.options.language, + }) + if self.options.model is not None: + params["model"] = self.options.model + + config: Dict[str, Any] = { + "vendor": "sarvam", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + - id: patch-617ee134 + content_hash: sha256:ea2d27ba8019bf09ce5766d322eb7218fcee0a90124e823ba16c4e45dc1af5a9 + original_commit: 617ee134d9dafbf4f4f83d5e98b80ad110c6e1bf + original_message: "feat(agentkit): support agent-level pipeline_id" + original_author: Hermes (agora) + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_pipeline_id.py + patch_content: | + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 187229f..86d4fbd 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -27,12 +27,14 @@ Agent( + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. 
| + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + @@ -47,6 +49,8 @@ Agent( + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + +`pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + @@ -202,6 +206,8 @@ create_session( + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + + preset: Optional[Union[str, Sequence[str]]] = None, + + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + @@ -219,6 +225,10 @@ Creates an `AgentSession` bound to the given client and channel. + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. 
| + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + diff --git a/docs/reference/session.md b/docs/reference/session.md + index 63402f6..76e1367 100644 + --- a/docs/reference/session.md + +++ b/docs/reference/session.md + @@ -33,6 +33,11 @@ AgentSession( + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + + preset: Optional[Union[str, Sequence[str]]] = None, + + pipeline_id: Optional[str] = None, + + expires_in: Optional[int] = None, + + debug: Optional[bool] = None, + + warn: Optional[Callable[[str], None]] = None, + ) + ``` + + @@ -51,6 +56,13 @@ AgentSession( + | `token` | `Optional[str]` | No | Pre-built RTC token | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + +| `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | + +| `debug` | `Optional[bool]` | No | Enable debug logging of the start request | + +| `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + + + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. 
+ + ## Methods + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index fea1f0d..0a652db 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -343,8 +343,10 @@ class Agent: + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + @@ -609,6 +611,11 @@ class Agent: + def name(self) -> typing.Optional[str]: + return self._name + + + @property + + def pipeline_id(self) -> typing.Optional[str]: + + """Published AI Studio pipeline ID used as this agent's base configuration.""" + + return self._pipeline_id + + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + @@ -693,6 +700,7 @@ class Agent: + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + @@ -945,6 +953,7 @@ class Agent: + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + + new_agent._pipeline_id = self._pipeline_id + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e113dc1..5c866ac 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -52,7 +52,8 @@ class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + + Optional fields + --------------- + - app_certificate, token, idle_timeout, enable_string_uid, expires_in + + app_certificate, 
token, idle_timeout, enable_string_uid, preset, + + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + @@ -290,14 +291,18 @@ class _AgentSessionBase: + return True + return mllm is not None + + - def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + + def _build_start_properties( + + self, + + token_opts: typing.Dict[str, typing.Any], + + skip_vendor_validation: bool, + + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + - skip_vendor_validation=True, + + skip_vendor_validation=skip_vendor_validation, + **token_opts, + ) + properties = self._dump_model(base_properties) + @@ -445,6 +450,7 @@ class AgentSession(_AgentSessionBase): + self._status = "starting" + + try: + + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + @@ -454,7 +460,7 @@ class AgentSession(_AgentSessionBase): + "expires_in": self._expires_in, + } + + - properties = self._build_start_properties(token_opts) + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + @@ -466,7 +472,7 @@ class AgentSession(_AgentSessionBase): + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + - "pipeline_id": self._pipeline_id, + + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + @@ -480,7 +486,7 @@ class AgentSession(_AgentSessionBase): + name=self._name, + properties=request_properties, + preset=resolved_preset, + - pipeline_id=self._pipeline_id, + + pipeline_id=pipeline_id, + 
request_options=self._request_options(), + ) + + @@ -766,6 +772,7 @@ class AsyncAgentSession(_AgentSessionBase): + self._status = "starting" + + try: + + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + @@ -775,7 +782,7 @@ class AsyncAgentSession(_AgentSessionBase): + "expires_in": self._expires_in, + } + + - properties = self._build_start_properties(token_opts) + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + @@ -787,7 +794,7 @@ class AsyncAgentSession(_AgentSessionBase): + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + - "pipeline_id": self._pipeline_id, + + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + @@ -801,7 +808,7 @@ class AsyncAgentSession(_AgentSessionBase): + name=self._name, + properties=request_properties, + preset=resolved_preset, + - pipeline_id=self._pipeline_id, + + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + diff --git a/tests/custom/test_pipeline_id.py b/tests/custom/test_pipeline_id.py + new file mode 100644 + index 0000000..c6c8c8f + --- /dev/null + +++ b/tests/custom/test_pipeline_id.py + @@ -0,0 +1,123 @@ + +import pytest + + + +from agora_agent import Agent + + + + + +def dump(value): + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + + + +class StartResponse: + + agent_id = "agent-id" + + + + + +class FakeAgentsClient: + + def __init__(self): + + self.calls = [] + + + + def start(self, appid, **kwargs): + + self.calls.append({"appid": appid, **kwargs}) + + return StartResponse() + + + + + +class FakeAsyncAgentsClient: + + def 
__init__(self): + + self.calls = [] + + + + async def start(self, appid, **kwargs): + + self.calls.append({"appid": appid, **kwargs}) + + return StartResponse() + + + + + +class FakeClient: + + app_id = "appid" + + app_certificate = None + + + + def __init__(self, agents): + + self.agents = agents + + + + + +def start_agent(agent, **overrides): + + agents = FakeAgentsClient() + + client = FakeClient(agents) + + options = { + + "channel": "channel", + + "token": "token", + + "agent_uid": "1", + + "remote_uids": ["100"], + + **overrides, + + } + + + + agent_id = agent.create_session(client, **options).start() + + + + assert agent_id == "agent-id" + + assert len(agents.calls) == 1 + + return agents.calls[0] + + + + + +def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["appid"] == "appid" + + assert call["name"] == "support" + + assert call["pipeline_id"] == "studio-pipeline-id" + + properties = dump(call["properties"]) + + assert properties["channel"] == "channel" + + assert properties["token"] == "token" + + assert properties["agent_rtc_uid"] == "1" + + assert properties["remote_rtc_uids"] == ["100"] + + + + + +def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + + call = start_agent( + + Agent(name="support", pipeline_id="agent-pipeline"), + + pipeline_id="session-pipeline", + + ) + + + + assert call["pipeline_id"] == "session-pipeline" + + + + + +def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + + + + +def test_pipeline_id_is_not_sent_inside_properties() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + assert "pipeline_id" not in dump(call["properties"]) + + + + + +def 
test_pipeline_id_survives_builder_clone() -> None: + + agent = Agent(name="support", pipeline_id="studio-pipeline-id").with_tools(True) + + + + assert agent.pipeline_id == "studio-pipeline-id" + + call = start_agent(agent) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + + + + +@pytest.mark.asyncio + +async def test_async_session_uses_agent_pipeline_id() -> None: + + agents = FakeAsyncAgentsClient() + + client = FakeClient(agents) + + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + + + agent_id = await agent.create_async_session( + + client, + + channel="channel", + + token="token", + + agent_uid="1", + + remote_uids=["100"], + + ).start() + + + + assert agent_id == "agent-id" + + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | + | `failure_message` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | + | `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. 
| + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. 
Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. + + + ```python + from agora_agent import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Deprecated. 
Configure `system_messages` on the LLM vendor instead. + + ### `with_greeting(greeting: str) -> Agent` + + Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. + + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. + + ### `with_max_history(max_history: int) -> Agent` + + Deprecated. Configure `max_history` on the LLM vendor instead. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). 
+ + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. 
+ + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. + + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | + | `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | + | `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | + | `max_history` | `Optional[int]` | Deprecated Agent-level max history | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, 
`AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + docs/reference/session.md: | + --- + sidebar_position: 3 + title: AgentSession + description: Full API reference for the Python AgentSession class. + --- + + # AgentSession / AsyncAgentSession Reference + + **Import:** + + ```python + from agora_agent import AgentSession + from agora_agent import AsyncAgentSession + # or both in a single statement: + from agora_agent import AgentSession, AsyncAgentSession + ``` + + ## Constructor + + Sessions are normally created via `Agent.create_session()`. Direct construction is available for advanced use: + + + ```python + AgentSession( + client: Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: List[str], + app_certificate: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + debug: Optional[bool] = None, + warn: Optional[Callable[[str], None]] = None, + ) + ``` + + `AsyncAgentSession` has the same constructor signature.
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `agent` | `Agent` | Yes | Agent configuration | + | `app_id` | `str` | Yes | Agora App ID | + | `name` | `str` | Yes | Session name | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `app_certificate` | `Optional[str]` | No | App Certificate (for auto token generation) | + | `token` | `Optional[str]` | No | Pre-built RTC token | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + | `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | + | `debug` | `Optional[bool]` | No | Enable debug logging of the start request | + | `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. + + ## Methods + + ### `start()` + + Start the agent session. Generates an RTC token if not provided, validates avatar/TTS config for cascading sessions, and calls the Agora API. MLLM sessions do not require TTS; an enabled avatar is rejected when MLLM is configured (a disabled avatar is allowed). 
+ + | | Sync (`AgentSession`) | Async (`AsyncAgentSession`) | + |---|---|---| + | **Signature** | `start() -> str` | `async start() -> str` | + | **Returns** | Agent ID | Agent ID | + | **Raises** | `RuntimeError` if not in `idle`, `stopped`, or `error` state | Same | + | **Raises** | `ValueError` if avatar/TTS sample rate mismatch or an enabled avatar is used with MLLM | Same | + + + ```python + # Sync + agent_id = session.start() + + # Async + agent_id = await session.start() + ``` + + ### `stop()` + + Stop the agent session. If the agent has already stopped (404 from API), transitions to `stopped` without raising. + + | | Sync | Async | + |---|---|---| + | **Signature** | `stop() -> None` | `async stop() -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + # Sync + session.stop() + + # Async + await session.stop() + ``` + + ### `say(text, priority=None, interruptable=None)` + + Send text to be spoken by the agent's TTS. + + | | Sync | Async | + |---|---|---| + | **Signature** | `say(text: str, priority: Optional[str] = None, interruptable: Optional[bool] = None) -> None` | Same with `async` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `text` | `str` | Yes | Text to speak | + | `priority` | `str` | No | `INTERRUPT`, `APPEND`, or `IGNORE` | + | `interruptable` | `bool` | No | Whether the message can be interrupted | + + + ```python + # Sync + session.say('Hello!', priority='INTERRUPT', interruptable=False) + + # Async + await session.say('Hello!', priority='INTERRUPT', interruptable=False) + ``` + + ### `interrupt()` + + Interrupt the agent while speaking or thinking. 
+ + | | Sync | Async | + |---|---|---| + | **Signature** | `interrupt() -> None` | `async interrupt() -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + # Sync + session.interrupt() + + # Async + await session.interrupt() + ``` + + ### `update(properties)` + + Update the agent configuration at runtime. + + | | Sync | Async | + |---|---|---| + | **Signature** | `update(properties: Any) -> None` | `async update(properties: Any) -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + from agora_agent.agents.types import UpdateAgentsRequestProperties + + # Sync + session.update(properties) + + # Async + await session.update(properties) + ``` + + ### `think(text, ...)` + + Inject a custom text instruction into the running agent. + + In API v2.7, omitting `on_listening_action` uses the server default `interrupt`. Pass `on_listening_action='inject'` explicitly to preserve the pre-v2.7 behavior. + + ```python + session.think('Summarize the last answer', on_listening_action='inject') + ``` + + ### `get_history()` + + Retrieve the conversation history. + + | | Sync | Async | + |---|---|---| + | **Signature** | `get_history() -> Any` | `async get_history() -> Any` | + | **Raises** | `RuntimeError` if no agent ID | Same | + + + ```python + # Sync + history = session.get_history() + + # Async + history = await session.get_history() + ``` + + ### `get_info()` + + Retrieve the current session info. + + | | Sync | Async | + |---|---|---| + | **Signature** | `get_info() -> Any` | `async get_info() -> Any` | + | **Raises** | `RuntimeError` if no agent ID | Same | + + + ```python + # Sync + info = session.get_info() + + # Async + info = await session.get_info() + ``` + + ### `get_turns(page_index=None, page_size=None)` + + Retrieve paginated turn analytics for a completed or running session. In v2.7, the API defaults to page 1 and up to 50 turns per page. 
Responses include `agent_id`, `name`, `channel`, `total_turn_count`, `pagination`, and `turns`. + + ```python + page = session.get_turns(page_index=1, page_size=50) + ``` + + ### `get_all_turns(page_size=None)` + + Fetch all turn pages and return a single `GetTurnsAgentsResponse` with the combined `turns` list. + + ```python + all_turns = session.get_all_turns(page_size=50) + ``` + + ### `on(event, handler)` + + Register an event handler. This method is synchronous on both `AgentSession` and `AsyncAgentSession`. + + + ```python + session.on('started', lambda data: print(f'Started: {data}')) + ``` + + | Parameter | Type | Description | + |---|---|---| + | `event` | `str` | Event type: `started`, `stopped`, or `error` | + | `handler` | `Callable[..., None]` | Callback function | + + ### `off(event, handler)` + + Remove a previously registered event handler. + + + ```python + session.off('started', my_handler) + ``` + + ## Properties + + | Property | Type | Description | + |---|---|---| + | `id` | `Optional[str]` | Agent ID (set after `start()`) | + | `status` | `str` | Current state: `idle`, `starting`, `running`, `stopping`, `stopped`, `error` | + | `agent` | `Agent` | The agent configuration | + | `app_id` | `str` | Agora App ID | + | `raw` | `AgentsClient` / `AsyncAgentsClient` | Direct access to Fern-generated agents client | + + ## State Transitions + + | Current State | Allowed Actions | + |---|---| + | `idle` | `start()` | + | `starting` | (waiting for API) | + | `running` | `stop()`, `say()`, `interrupt()`, `update()`, `get_history()`, `get_info()` | + | `stopping` | (waiting for API) | + | `stopped` | `start()` (restart) | + | `error` | `start()` (retry) | + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import 
StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from 
..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config 
import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + from ..types.mllm import Mllm + from ..types.mllm_turn_detection import MllmTurnDetection + from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode + from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = Llm + LlmStyle = GeneratedLlmStyle + SttConfig = Asr + AsrConfig = SttConfig + SttVendor = typing.Any + TtsConfig = Tts + MllmConfig = Mllm + MllmVendor = GeneratedMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + 
TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
class SessionParamsInput(typing_extensions.TypedDict, total=False):
    """Plain-dict shape accepted for an agent's session ``parameters``.

    ``Agent.__init__`` accepts this alongside ``SessionParams`` for its
    ``parameters`` argument. ``total=False`` makes every key optional.
    """

    # Sub-configs reuse the generated request types directly.
    silence_config: StartAgentsRequestPropertiesParametersSilenceConfig
    farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig
    data_channel: StartAgentsRequestPropertiesParametersDataChannel
    enable_metrics: bool
    enable_error_message: bool
    audio_scenario: ParametersAudioScenario


class ThinkOptions(typing_extensions.TypedDict, total=False):
    """Optional settings for a think request; every key may be omitted.

    The three ``on_*_action`` keys are typed with the generated
    agent-management think request types.
    NOTE(review): presumably consumed by the session ``think()`` call --
    confirm against agent_session.py.
    """

    on_listening_action: AgentThinkAgentManagementRequestOnListeningAction
    on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction
    on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction
    interruptable: bool
    metadata: typing.Dict[str, str]


class GetTurnsOptions(typing_extensions.TypedDict, total=False):
    """Optional pagination keys (``page_index``, ``page_size``) for a turns query.

    NOTE(review): presumably mirrors the ``get_turns()`` keyword arguments --
    confirm against agent_session.py.
    """

    page_index: int
    page_size: int


class SayOptions(typing_extensions.TypedDict, total=False):
    """Optional keys (``priority``, ``interruptable``) for a speak request.

    NOTE(review): presumably mirrors the ``say()`` keyword arguments --
    confirm against agent_session.py.
    """

    priority: SpeakAgentsRequestPriority
    interruptable: bool


class SessionOptions(typing_extensions.TypedDict, total=False):
    """Optional per-session settings expressed as a plain dict.

    Every key may be omitted (``total=False``).
    NOTE(review): the keys look like the session-creation keyword
    parameters -- confirm against the session constructor.
    """

    name: str
    channel: str
    token: str
    agent_uid: str
    remote_uids: typing.List[str]
    idle_timeout: int
    enable_string_uid: bool
    # A single preset name or a sequence of preset names.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callback that receives a warning message string.
    warn: typing.Callable[[str], None]
DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US"
TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] = (
    "ar-EG",
    "ar-JO",
    "ar-SA",
    "ar-AE",
    "bn-IN",
    "zh-CN",
    "zh-HK",
    "zh-TW",
    "nl-NL",
    "en-IN",
    "en-US",
    "fil-PH",
    "fr-FR",
    "de-DE",
    "gu-IN",
    "he-IL",
    "hi-IN",
    "id-ID",
    "it-IT",
    "ja-JP",
    "kn-IN",
    "ko-KR",
    "ms-MY",
    "fa-IR",
    "pt-PT",
    "ru-RU",
    "es-ES",
    "ta-IN",
    "te-IN",
    "th-TH",
    "tr-TR",
    "vi-VN",
)
# Set view of the tuple above, used for membership tests.
_TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES)


def _dump_optional_model(value: typing.Any) -> typing.Any:
    """Return ``value`` dumped to a dict when it exposes a dump method.

    ``model_dump`` is tried first, then ``dict``; whichever exists is called
    with ``exclude_none=True``. Values exposing neither attribute are
    returned unchanged.
    """
    for dumper_name in ("model_dump", "dict"):
        if hasattr(value, dumper_name):
            return getattr(value, dumper_name)(exclude_none=True)
    return value


def _is_turn_detection_language(value: typing.Any) -> bool:
    """Tell whether ``value`` is one of the supported language codes."""
    # The type guard runs first so unhashable inputs never reach the set lookup.
    if not isinstance(value, str):
        return False
    return value in _TURN_DETECTION_LANGUAGES


def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage:
    """Return ``value`` unchanged when it is a supported language code.

    Raises
    ------
    ValueError
        If ``value`` is not a string listed in ``TURN_DETECTION_LANGUAGE_VALUES``.
    """
    if _is_turn_detection_language(value):
        return value  # type: ignore[return-value]
    raise ValueError(f"Invalid interaction language: {value}")
+ + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + 
def with_tts(self, vendor: BaseTTS) -> "Agent":
    """Returns a new Agent configured with the given TTS vendor.

    The vendor's ``sample_rate`` is compared with the sample rate recorded by
    a previously attached avatar. The comparison is skipped when the avatar
    requirement is ``None`` or ``0``, or when the vendor reports no sample rate.

    Raises
    ------
    ValueError
        If both rates are known and they differ.
    """
    tts_rate = vendor.sample_rate
    required_rate = self._avatar_required_sample_rate
    avatar_constrains_rate = required_rate not in (None, 0)
    if avatar_constrains_rate and tts_rate is not None and tts_rate != required_rate:
        raise ValueError(
            f"Avatar requires TTS sample rate of {required_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {required_rate}."
        )

    configured = self._clone()
    configured._tts = vendor.to_config()
    # Remember the rate so a later with_avatar() call can validate against it.
    configured._tts_sample_rate = tts_rate
    return configured
def with_avatar(self, vendor: BaseAvatar) -> "Agent":
    """Returns a new Agent configured with the given avatar vendor.

    The avatar's ``required_sample_rate`` is compared with the TTS sample rate
    already recorded on this agent. The comparison is skipped when the avatar
    requirement is ``None`` or ``0``, or when no TTS sample rate is recorded.

    Raises
    ------
    ValueError
        If both rates are known and they differ.
    """
    # Note: the avatar + MLLM combination is deliberately NOT rejected here;
    # ``to_properties`` rejects it (only when the avatar is enabled), so the
    # two can still be combined for testing or the disabled-avatar pattern.
    avatar_rate = vendor.required_sample_rate
    tts_rate = self._tts_sample_rate
    avatar_constrains_rate = avatar_rate not in (None, 0)
    if avatar_constrains_rate and tts_rate is not None and tts_rate != avatar_rate:
        raise ValueError(
            f"Avatar requires TTS sample rate of {avatar_rate} Hz, "
            f"but TTS is configured with {tts_rate} Hz. "
            f"Please update your TTS sample_rate to {avatar_rate}."
        )

    configured = self._clone()
    configured._avatar = vendor.to_config()
    # Remember the requirement so a later with_tts() call can validate against it.
    configured._avatar_required_sample_rate = avatar_rate
    return configured
def with_tools(self, enabled: bool = True) -> "Agent":
    """Returns a new Agent whose advanced features carry ``enable_tools=enabled``.

    Three shapes of existing advanced-features state are handled:
    nothing set yet (a fresh features model is created), a plain dict (a
    merged copy is stored), and a model object (an updated copy is made via
    ``_copy_model_update``).
    """
    derived = self._clone()
    current = derived._advanced_features
    if current is None:
        updated = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled)
    elif isinstance(current, dict):
        # Build a new dict so the source agent's features are left untouched.
        updated = typing.cast(AdvancedFeatures, dict(current, enable_tools=enabled))
    else:
        updated = self._copy_model_update(current, {"enable_tools": enabled})
    derived._advanced_features = updated
    return derived
def with_labels(self, labels: typing.Dict[str, str]) -> "Agent":
    """Returns a new Agent carrying a copy of the given custom labels.

    The labels are stored as a fresh ``dict`` built from ``labels``, so later
    changes to the caller's mapping do not reach the returned agent.
    """
    derived = self._clone()
    # Shallow copy: decouple the agent from the caller's dict.
    derived._labels = dict(labels)
    return derived
@staticmethod
def _field_value(value: typing.Any, field: str) -> typing.Any:
    """Reads ``field`` from a dict-style or attribute-style config.

    Dicts are read with ``.get``; any other non-``None`` object is read with
    ``getattr``. A ``None`` container, a missing key, or a missing attribute
    all produce ``None``.
    """
    if isinstance(value, dict):
        return value.get(field)
    # An absent config never contributes a value, whatever attribute is asked for.
    return None if value is None else getattr(value, field, None)
typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": 
def create_session(
    self,
    client: typing.Any,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    name: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> "AgentSession":
    """Create an :class:`AgentSession` bound to this agent and ``client``.

    The session name is the first truthy value among ``name``, the agent's
    own name, and a generated ``agent-<unix seconds>`` string. ``app_id`` and
    ``app_certificate`` are read from ``client`` when it exposes them and
    default to ``""`` and ``None`` respectively. All other arguments are
    forwarded to the session unchanged.
    """
    # Imported here rather than at module level (agent_session imports this module).
    from .agent_session import AgentSession

    resolved_name = name or self._name
    if not resolved_name:
        resolved_name = f"agent-{int(time.time())}"

    return AgentSession(
        client=client,
        agent=self,
        app_id=getattr(client, "app_id", ""),
        app_certificate=getattr(client, "app_certificate", None),
        name=resolved_name,
        channel=channel,
        token=token,
        agent_uid=agent_uid,
        remote_uids=remote_uids,
        idle_timeout=idle_timeout,
        enable_string_uid=enable_string_uid,
        preset=preset,
        pipeline_id=pipeline_id,
        expires_in=expires_in,
        debug=debug,
        warn=warn,
    )
None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + 
base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + base_kwargs["asr"] = self._resolve_asr_config() + base_kwargs["turn_detection"] = self._resolve_turn_detection_config() + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. 
def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]:
    """Builds the ASR config from the STT vendor config.

    The ``language`` key is left out of the result. When no STT config is set,
    or nothing remains once ``language`` is removed, the result is
    ``{"vendor": "ares"}``. The stored STT config itself is not modified.
    """
    stt_config = self._stt or {}
    asr_config = {key: item for key, item in stt_config.items() if key != "language"}
    if asr_config:
        return asr_config
    return {"vendor": "ares"}
self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + 
is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]:
    """Return per-request auth headers when the client uses app credentials.

    When the client's ``auth_mode`` is ``"app-credentials"`` a new ConvoAI
    token is generated on every call for this session's channel and agent UID
    and returned as an ``Authorization: agora token=<token>`` header. For any
    other auth mode ``None`` is returned.

    The app ID and certificate are taken from the client when it exposes
    them, otherwise from the values stored on the session.

    Raises
    ------
    RuntimeError
        If app-credentials mode is active but no app certificate is available.
    """
    client = self._client
    if getattr(client, "auth_mode", None) != "app-credentials":
        return None

    resolved_app_id: str = getattr(client, "app_id", self._app_id)
    resolved_certificate: typing.Optional[str] = getattr(client, "app_certificate", self._app_certificate)
    if not resolved_certificate:
        raise RuntimeError("app_certificate is required for app-credentials auth mode")

    numeric_uid = _parse_numeric_uid(self._agent_uid, "agent_uid")
    fresh_token = generate_convo_ai_token(
        app_id=resolved_app_id,
        app_certificate=resolved_certificate,
        channel_name=self._channel,
        uid=numeric_uid,
    )
    return {"Authorization": f"agora token={fresh_token}"}
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation: bool, + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=skip_vendor_validation, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] 
= self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. 
+ + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = 
StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). 
+ interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_pipeline_id.py: | + import pytest + + from agora_agent import Agent + + + def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class StartResponse: + agent_id = "agent-id" + + + class FakeAgentsClient: + def __init__(self): + self.calls = [] + + def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + + class FakeAsyncAgentsClient: + def __init__(self): + self.calls = [] + + async def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + + class FakeClient: + app_id = "appid" + app_certificate = None + + def __init__(self, agents): + self.agents = agents + + + def start_agent(agent, **overrides): + agents = FakeAgentsClient() + client = FakeClient(agents) + options = { + "channel": "channel", + "token": "token", + "agent_uid": "1", + "remote_uids": ["100"], + **overrides, + } + + agent_id = agent.create_session(client, **options).start() + + assert agent_id == "agent-id" + assert len(agents.calls) == 1 + return agents.calls[0] + + + def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["appid"] == "appid" + assert call["name"] == "support" + assert call["pipeline_id"] == "studio-pipeline-id" + properties = dump(call["properties"]) + assert properties["channel"] == "channel" + assert properties["token"] == "token" + assert properties["agent_rtc_uid"] == "1" + assert properties["remote_rtc_uids"] == ["100"] + + + def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + call = start_agent( + Agent(name="support", pipeline_id="agent-pipeline"), + pipeline_id="session-pipeline", + ) + + assert call["pipeline_id"] == 
"session-pipeline" + + + def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + + + def test_pipeline_id_is_not_sent_inside_properties() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(call["properties"]) + + + def test_pipeline_id_survives_builder_clone() -> None: + agent = Agent(name="support", pipeline_id="studio-pipeline-id").with_tools(True) + + assert agent.pipeline_id == "studio-pipeline-id" + call = start_agent(agent) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + + @pytest.mark.asyncio + async def test_async_session_uses_agent_pipeline_id() -> None: + agents = FakeAsyncAgentsClient() + client = FakeClient(agents) + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + agent_id = await agent.create_async_session( + client, + channel="channel", + token="token", + agent_uid="1", + remote_uids=["100"], + ).start() + + assert agent_id == "agent-id" + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) + status: unresolved + - id: patch-8e22e6d0 + content_hash: sha256:4baa4d46c129dde02b82a8367fdc1f9217d52267f82eb18f190d230d39a90927 + original_commit: 8e22e6d069e77f4c652e15f2f37945538c88c7c4 + original_message: udpated agent docs + original_author: Hermes (agora) + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 8e22e6d069e77f4c652e15f2f37945538c88c7c4 Mon Sep 17 00:00:00 2001 + From: "Hermes (agora)" + Date: Tue, 2 Jun 2026 15:36:16 -0400 + Subject: [PATCH] udpated agent docs + + --- + docs/reference/agent.md | 2 +- + 1 file changed, 1 
insertion(+), 1 deletion(-) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 86d4fbd..5693e0b 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -34,7 +34,6 @@ Agent( + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + -| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + @@ -48,6 +47,7 @@ Agent( + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | + | `failure_message` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | + | `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. 
| + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + | `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. 
+ + + ```python + from agora_agent import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. + + + ```python + from agora_agent import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. 
+ + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Deprecated. Configure `system_messages` on the LLM vendor instead. + + ### `with_greeting(greeting: str) -> Agent` + + Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. + + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. + + ### `with_max_history(max_history: int) -> Agent` + + Deprecated. Configure `max_history` on the LLM vendor instead. 
+ + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. 
| + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | + | `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | + | `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | + | `max_history` | `Optional[int]` | Deprecated Agent-level max history | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ status: unresolved + - id: patch-bed29b6b + content_hash: sha256:8008d9c33a194a48ef317868953c26d5b03ede60c23743b4249260894c0f6417 + original_commit: bed29b6b7d4d08480a8510b26b5e21d1ef234cc9 + original_message: "chore: bump Python packages to 2.1.0" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - compat/agora-agent-server-sdk/pyproject.toml + patch_content: | + diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index ac93128..468294b 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + -version = "v2.0.0" + +version = "v2.1.0" + description = "Compatibility shim for the renamed agora-agents package." + readme = "README.md" + authors = [] + @@ -35,7 +35,7 @@ Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-pyth + + [tool.poetry.dependencies] + python = "^3.8" + -agora-agents = ">=2.0.0,<3.0.0" + +agora-agents = ">=2.1.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + theirs_snapshot: + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.1.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.1.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + user_owned: true diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index cfa8580..48b9053 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -336,6 +336,7 @@ Use `turn_detection.language` for Agora interaction language; it defaults to `en | `api_key` | `str` | BYOK only | `None` | Deepgram API key. Optional only for Agora-managed `nova-2` and `nova-3`. 
| | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `smart_format` | `bool` | No | `None` | Enable smart formatting | | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | diff --git a/scripts/check_release_workflow.py b/scripts/check_release_workflow.py new file mode 100644 index 0000000..1a6e065 --- /dev/null +++ b/scripts/check_release_workflow.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import re +import sys +from pathlib import Path +from typing import NoReturn + + +def fail(message: str) -> NoReturn: + print(message, file=sys.stderr) + raise SystemExit(1) + + +def read_version(path: str) -> str: + text = Path(path).read_text() + match = re.search(r'^version\s*=\s*"v?([^"]+)"', text, re.M) + if not match: + fail(f"version not found in {path}") + return match.group(1) + + +def read_compat_dependency(path: str) -> str: + text = Path(path).read_text() + match = re.search(r'^agora-agents\s*=\s*"([^"]+)"', text, re.M) + if not match: + fail(f"agora-agents dependency not found in {path}") + return match.group(1) + + +root_version = read_version("pyproject.toml") +compat_pyproject = "compat/agora-agent-server-sdk/pyproject.toml" +compat_version = read_version(compat_pyproject) +compat_dependency = read_compat_dependency(compat_pyproject) + +if compat_version != root_version: + fail(f"Compat package version ({compat_version}) must match root package version ({root_version}).") + +expected_dependency = f">={root_version},<3.0.0" +if compat_dependency != expected_dependency: + fail(f"Compat package dependency on agora-agents ({compat_dependency}) must be {expected_dependency}.") + +release_workflow = Path(".github/workflows/release.yml").read_text() +required_workflow_markers = [ + ("contents: write", "release workflow 
must have contents: write so it can create GitHub releases"), + ("gh release create", "release workflow must create a GitHub release when one does not exist"), + ("gh release edit", "release workflow must update an existing GitHub release"), + ("release_notes.md", "release workflow must generate and use a release notes file"), +] + +for marker, message in required_workflow_markers: + if marker not in release_workflow: + fail(message) + +print("Release metadata and workflow checks passed.") diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 6275f04..ecf01c6 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -57,6 +57,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts from ..types.asr import Asr from ..types.llm import Llm from ..types.llm_style import LlmStyle as GeneratedLlmStyle @@ -536,6 +538,23 @@ def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent ) return new_agent + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, 
+ ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + def with_failure_message(self, message: str) -> "Agent": """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" new_agent = self._clone() diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index dbff562..dca9ee8 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -24,6 +24,7 @@ is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 50bdd08..f48098c 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -177,6 +177,49 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "generic", "params": params} +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + +class 
GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 9156a01..5dd822d 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,7 +1,10 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field, model_validator +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, +) from .base import BaseLLM LlmGreetingConfigs = Dict[str, Any] diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index 236a494..6a260d8 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import BaseModel, ConfigDict, Field diff --git 
a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index e5117b0..bb222a9 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -89,6 +89,7 @@ class SpeechmaticsSTTOptions(BaseModel): api_key: str = Field(..., description="Speechmatics API key") language: str = Field(..., description="Language code (e.g., en, es, fr)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -124,6 +125,7 @@ class DeepgramSTTOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="Deepgram API key") model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -353,6 +355,7 @@ class SarvamSTTOptions(BaseModel): api_key: str = Field(..., description="Sarvam API key") language: str = Field(..., description="Language code (e.g., en, hi, ta)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") additional_params: Optional[Dict[str, Any]] = Field(default=None) diff --git a/src/agora_agent/agentkit/vendors/tts.py 
b/src/agora_agent/agentkit/vendors/tts.py index a052ea5..3986986 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -139,6 +139,8 @@ def to_config(self) -> Dict[str, Any]: } if self.options.api_key is not None: params["api_key"] = self.options.api_key + if self.options.base_url is not None: + params["base_url"] = self.options.base_url params["base_url"] = self.options.base_url params["model"] = self.options.model elif self.options.model is not None: @@ -254,6 +256,8 @@ def to_config(self) -> Dict[str, Any]: "voice": self.options.voice_id, "engine": self.options.engine, } + if self.options.engine is not None: + params["engine"] = self.options.engine result: Dict[str, Any] = {"vendor": "amazon", "params": params} if self.options.skip_patterns is not None: @@ -392,6 +396,8 @@ def to_config(self) -> Dict[str, Any]: "reference_id": self.options.reference_id, "backend": self.options.backend, } + if self.options.backend is not None: + params["backend"] = self.options.backend result: Dict[str, Any] = {"vendor": "fishaudio", "params": params} if self.options.skip_patterns is not None: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py index 40dbb02..fb58a36 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py @@ -5,6 +5,7 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr_language import AsrLanguage from .start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from .start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness from 
.start_agents_request_properties_turn_detection_interrupt_mode import ( @@ -18,6 +19,11 @@ class StartAgentsRequestPropertiesTurnDetection(UncheckedBaseModel): Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. """ + language: typing.Optional[AsrLanguage] = pydantic.Field(default=None) + """ + BCP-47 language tag identifying the primary language used for agent interaction. + """ + mode: typing.Optional[typing.Literal["default"]] = pydantic.Field(default=None) """ Conversation turn detection mode: diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..9719b04 --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), 
{"data_channel": "rtm"}) # noqa: SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def test_with_audio_scenario_sets_session_parameter(): + agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "agent greeting" + assert properties.llm.failure_message == "agent failure" + assert properties.llm.max_history == 2 + + +def 
test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + ) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + 
token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard must fire before the token-generation step so callers get a + clear, actionable error even when app_id/app_certificate are empty. + """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + 
.with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..198fcd0 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,383 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, + RimeTTS, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + 
agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + warn=warn, + ) + + +def test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") 
+ .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["llm"]["greeting_message"] == "agent greeting" + assert properties["llm"]["failure_message"] == "agent failure" + assert properties["llm"]["max_history"] == 2 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def 
test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # 
noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 + + +def test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = 
AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..8473821 --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,122 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +import warnings + +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + +def test_xai_realtime_deprecated_alias_emits_same_vendor(): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", DeprecationWarning) + config = XaiRealtime(api_key="xai-key").to_config() + assert len(caught) == 1 + assert issubclass(caught[0].category, DeprecationWarning) + assert config["vendor"] == "xai" + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", 
predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def test_generic_avatar_omits_session_enriched_fields_when_unset(): + config = GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + assert config["params"]["model"] == "explicit-model" + assert config["params"]["project_id"] == "explicit-project" + assert config["params"]["location"] == "explicit-region" + assert config["params"]["adc_credentials_string"] == "{}" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), + ).to_config() + + assert 
config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False