From 1ca34d3873b54f7ca8e20b0686ae3a8be3a6cd12 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 3 Jun 2026 10:58:52 -0400 Subject: [PATCH 01/14] Separate STT provider language from turn detection defaults --- changelog.md | 2 +- docs/concepts/vendors.md | 2 +- docs/reference/vendors.md | 2 +- src/agora_agent/agentkit/agent.py | 7 +++---- tests/custom/test_stt_language.py | 16 ++++++++-------- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/changelog.md b/changelog.md index dc8dcc6..896e12d 100644 --- a/changelog.md +++ b/changelog.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Added -- **Turn detection language** — AgentKit now manages Agora interaction language through `turn_detection.language`, validates it against the supported BCP-47 language list, and sends the default `en-US` when no language is provided. +- **Turn detection language** — AgentKit now manages Agora interaction language through `turn_detection.language`, validates it against the supported BCP-47 language list, and sends the default `en` when no language is provided. - **Provider parameter parity** — ASR, LLM, MLLM, TTS, and avatar wrappers expose typed provider parameters plus passthrough fields where the generated core supports additional properties. ### Changed diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index c59ae7c..afddf19 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -75,7 +75,7 @@ tts = ElevenLabsTTS( Used with `agent.with_stt()`. -Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. 
| Class | Provider | Required Parameters | |---|---|---| diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index cfa8580..c822988 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -318,7 +318,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid ## STT Vendors -Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en`. Provider-specific language values remain under `asr.params` and may use a different format. ### `SpeechmaticsSTT` diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 6275f04..f684228 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -221,6 +221,7 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "zh-HK", "zh-TW", "nl-NL", + "en", "en-IN", "en-US", "fil-PH", @@ -246,7 +247,7 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "vi-VN", ] -DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" +DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en" TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] 
= ( "ar-EG", "ar-JO", @@ -257,6 +258,7 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "zh-HK", "zh-TW", "nl-NL", + "en", "en-IN", "en-US", "fil-PH", @@ -963,13 +965,10 @@ def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: return asr_config def _resolve_turn_detection_config(self) -> TurnDetectionConfig: - existing_stt_language = self._stt.get("language") if self._stt is not None else None existing_turn_detection_language = self._field_value(self._turn_detection, "language") language = ( existing_turn_detection_language if existing_turn_detection_language is not None - else existing_stt_language - if _is_turn_detection_language(existing_stt_language) else DEFAULT_TURN_DETECTION_LANGUAGE ) language = _validate_turn_detection_language(language) diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index c398e02..5690ff0 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -39,13 +39,13 @@ def properties(agent: Agent) -> dict: ) -def test_bcp47_stt_language_sets_turn_detection_language_and_provider_param() -> None: - props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en-US"))) +def test_bcp47_stt_language_stays_in_asr_params_and_defaults_turn_detection() -> None: + props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) assert props["asr"]["vendor"] == "speechmatics" assert "language" not in props["asr"] - assert props["turn_detection"]["language"] == "en-US" - assert props["asr"]["params"]["language"] == "en-US" + assert props["turn_detection"]["language"] == "en" + assert props["asr"]["params"]["language"] == "en" def test_provider_language_defaults_turn_detection_language_when_not_supported_by_ares() -> None: @@ -53,7 +53,7 @@ def test_provider_language_defaults_turn_detection_language_when_not_supported_b assert props["asr"]["vendor"] == "speechmatics" assert "language" not in props["asr"] - assert 
props["turn_detection"]["language"] == "en-US" + assert props["turn_detection"]["language"] == "en" assert props["asr"]["params"]["language"] == "en" @@ -71,15 +71,15 @@ def test_turn_detection_language_can_differ_from_provider_language() -> None: def test_invalid_turn_detection_language_is_rejected() -> None: - with pytest.raises(ValueError, match="Invalid interaction language: en"): - properties(Agent(turn_detection=TurnDetectionConfig(language="en"))) # type: ignore[arg-type] + with pytest.raises(ValueError, match="Invalid interaction language: xx"): + properties(Agent(turn_detection=TurnDetectionConfig(language="xx"))) # type: ignore[arg-type] def test_default_turn_detection_language_is_sent_without_stt() -> None: props = properties(base_agent()) assert props["asr"] == {"vendor": "ares"} - assert props["turn_detection"] == {"language": "en-US"} + assert props["turn_detection"] == {"language": "en"} def test_stt_vendor_params_match_documented_shapes() -> None: From fdfa4ac4cf9d4211878a5b734a9b63efa44bfd19 Mon Sep 17 00:00:00 2001 From: "Hermes (agora)" Date: Wed, 3 Jun 2026 19:24:15 -0400 Subject: [PATCH 02/14] fix(agentkit): source ASR language from turn detection Optional longer body: Keep provider STT language settings inside asr.params, populate REST asr.language from turn_detection.language, and treat Ares as provider-only. --- README.md | 2 +- changelog.md | 6 +- docs/concepts/vendors.md | 2 +- docs/reference/vendors.md | 3 +- src/agora_agent/agentkit/agent.py | 11 +-- src/agora_agent/agentkit/vendors/stt.py | 106 +----------------------- tests/custom/test_stt_language.py | 44 +++++++--- 7 files changed, 47 insertions(+), 127 deletions(-) diff --git a/README.md b/README.md index c8cbabf..4dee35d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install agora-agents ## Quick Start Start with the `Agent` builder: create a client with app credentials, choose your ASR, LLM, and TTS providers, then start a session. 
Omit vendor API keys for supported Agora-managed models, or provide keys when you want BYOK. -Set Agora interaction language with `turn_detection.language`; provider-specific STT language values remain under `asr.params`. +Set Agora interaction language with `turn_detection.language`; provider-specific STT language values remain under `asr.params`. Ares uses only the REST `asr.language` value sourced from `turn_detection.language`. ```python import os diff --git a/changelog.md b/changelog.md index 896e12d..9050db2 100644 --- a/changelog.md +++ b/changelog.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Added -- **Turn detection language** — AgentKit now manages Agora interaction language through `turn_detection.language`, validates it against the supported BCP-47 language list, and sends the default `en` when no language is provided. +- **Turn detection language** — AgentKit now manages Agora interaction language through `turn_detection.language`, validates it against the supported BCP-47 language list, and sends the default `en-US` when no language is provided. - **Provider parameter parity** — ASR, LLM, MLLM, TTS, and avatar wrappers expose typed provider parameters plus passthrough fields where the generated core supports additional properties. ### Changed @@ -21,7 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Fixed - **Managed-provider validation** — AgentKit validation now distinguishes preset-backed providers from BYOK providers so required provider fields are only required when credentials are caller-supplied. -- **Language placement** — Provider-specific STT language values remain under `asr.params`, while Agora interaction language is emitted separately as `turn_detection.language`. +- **Language placement** — Provider-specific STT language values remain under `asr.params`; the REST `asr.language` field is populated from `turn_detection.language`. 
## [v2.0.0] — 2026-05-21 @@ -114,7 +114,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Fixed -- **`AresSTT`** — Removed redundant `language` key from the `params` dict. Language is now emitted only at the top level. `params` is only included when `additional_params` is provided. +- **`AresSTT`** — Removed redundant `language` key from the `params` dict. Ares only selects the provider; AgentKit populates REST `asr.language` from `turn_detection.language`. `params` is only included when `additional_params` is provided. - **`OpenAIRealtime` / `VertexAI` (MLLM)** — Agent-level `greeting` and `failure_message` defaults are now correctly applied when missing in MLLM mode. Previously these values were silently dropped. - **`VertexAI` (MLLM)** — `messages` is emitted at the MLLM top level, matching the generated core SDK contract. diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index afddf19..2ec5439 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -75,7 +75,7 @@ tts = ElevenLabsTTS( Used with `agent.with_stt()`. -Use `turn_detection.language` for Agora interaction language; it defaults to `en`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. Ares does not take a provider language option; AgentKit uses `turn_detection.language` for REST `asr.language`. | Class | Provider | Required Parameters | |---|---|---| diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index c822988..42d59eb 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -318,7 +318,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid ## STT Vendors -Use `turn_detection.language` for Agora interaction language; it defaults to `en`. 
Provider-specific language values remain under `asr.params` and may use a different format. +Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. AgentKit populates REST `asr.language` from `turn_detection.language`. ### `SpeechmaticsSTT` @@ -396,7 +396,6 @@ For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. For | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `language` | `str` | No | `None` | Language code | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `SarvamSTT` diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f684228..f7ba770 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -247,7 +247,7 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "vi-VN", ] -DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en" +DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] 
= ( "ar-EG", "ar-JO", @@ -921,9 +921,10 @@ def to_properties( allow_missing_llm = "llm" in allow_missing_categories allow_missing_tts = "tts" in allow_missing_categories + turn_detection_config = self._resolve_turn_detection_config() if not skip_asr_validation and (self._stt is not None or not allow_missing_asr): - base_kwargs["asr"] = self._resolve_asr_config() - base_kwargs["turn_detection"] = self._resolve_turn_detection_config() + base_kwargs["asr"] = self._resolve_asr_config(turn_detection_config) + base_kwargs["turn_detection"] = turn_detection_config if skip_vendor_validation: return StartAgentsRequestProperties(**base_kwargs) @@ -957,11 +958,11 @@ def _resolve_llm_config(self) -> typing.Dict[str, typing.Any]: llm_config["max_history"] = self._max_history return llm_config - def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + def _resolve_asr_config(self, turn_detection_config: TurnDetectionConfig) -> typing.Dict[str, typing.Any]: asr_config = dict(self._stt or {}) - asr_config.pop("language", None) if not asr_config: asr_config["vendor"] = "ares" + asr_config["language"] = self._field_value(turn_detection_config, "language") return asr_config def _resolve_turn_detection_config(self) -> TurnDetectionConfig: diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index e5117b0..383147a 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -1,89 +1,12 @@ -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Optional from pydantic import BaseModel, ConfigDict, Field, model_validator -from typing_extensions import Literal from .base import BaseSTT -TurnDetectionLanguage = Literal[ - "ar-EG", - "ar-JO", - "ar-SA", - "ar-AE", - "bn-IN", - "zh-CN", - "zh-HK", - "zh-TW", - "nl-NL", - "en-IN", - "en-US", - "fil-PH", - "fr-FR", - "de-DE", - "gu-IN", - "he-IL", - "hi-IN", - "id-ID", - "it-IT", - "ja-JP", - "kn-IN", - "ko-KR", - "ms-MY", - "fa-IR", 
- "pt-PT", - "ru-RU", - "es-ES", - "ta-IN", - "te-IN", - "th-TH", - "tr-TR", - "vi-VN", -] - -TURN_DETECTION_LANGUAGE_VALUES: Tuple[TurnDetectionLanguage, ...] = ( - "ar-EG", - "ar-JO", - "ar-SA", - "ar-AE", - "bn-IN", - "zh-CN", - "zh-HK", - "zh-TW", - "nl-NL", - "en-IN", - "en-US", - "fil-PH", - "fr-FR", - "de-DE", - "gu-IN", - "he-IL", - "hi-IN", - "id-ID", - "it-IT", - "ja-JP", - "kn-IN", - "ko-KR", - "ms-MY", - "fa-IR", - "pt-PT", - "ru-RU", - "es-ES", - "ta-IN", - "te-IN", - "th-TH", - "tr-TR", - "vi-VN", -) -_TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} -def _turn_detection_language(language: Optional[str]) -> Optional[TurnDetectionLanguage]: - if language in _TURN_DETECTION_LANGUAGES: - return language # type: ignore[return-value] - return None - - class SpeechmaticsSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -112,9 +35,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "speechmatics", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -155,9 +75,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "deepgram", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -186,9 +103,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "microsoft", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -223,9 +137,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "openai", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = 
turn_detection_language return config @@ -260,9 +171,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "google", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -293,9 +201,6 @@ def to_config(self) -> Dict[str, Any]: "vendor": "amazon", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config @@ -323,16 +228,12 @@ def to_config(self) -> Dict[str, Any]: "vendor": "assemblyai", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config class AresSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") - language: Optional[TurnDetectionLanguage] = Field(default=None, description="Language code") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AresSTT(BaseSTT): @@ -341,8 +242,6 @@ def __init__(self, **kwargs: Any): def to_config(self) -> Dict[str, Any]: config: Dict[str, Any] = {"vendor": "ares"} - if self.options.language is not None: - config["language"] = self.options.language if self.options.additional_params: config["params"] = self.options.additional_params return config @@ -373,7 +272,4 @@ def to_config(self) -> Dict[str, Any]: "vendor": "sarvam", "params": params, } - turn_detection_language = _turn_detection_language(self.options.language) - if turn_detection_language is not None: - config["language"] = turn_detection_language return config diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index 5690ff0..aa128a9 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -43,17 +43,17 @@ def 
test_bcp47_stt_language_stays_in_asr_params_and_defaults_turn_detection() -> props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) assert props["asr"]["vendor"] == "speechmatics" - assert "language" not in props["asr"] - assert props["turn_detection"]["language"] == "en" + assert props["asr"]["language"] == "en-US" + assert props["turn_detection"]["language"] == "en-US" assert props["asr"]["params"]["language"] == "en" -def test_provider_language_defaults_turn_detection_language_when_not_supported_by_ares() -> None: +def test_provider_language_does_not_set_turn_detection_language() -> None: props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) assert props["asr"]["vendor"] == "speechmatics" - assert "language" not in props["asr"] - assert props["turn_detection"]["language"] == "en" + assert props["asr"]["language"] == "en-US" + assert props["turn_detection"]["language"] == "en-US" assert props["asr"]["params"]["language"] == "en" @@ -66,7 +66,7 @@ def test_turn_detection_language_can_differ_from_provider_language() -> None: ) assert props["turn_detection"]["language"] == "fr-FR" - assert "language" not in props["asr"] + assert props["asr"]["language"] == "fr-FR" assert props["asr"]["params"]["language"] == "en" @@ -78,12 +78,14 @@ def test_invalid_turn_detection_language_is_rejected() -> None: def test_default_turn_detection_language_is_sent_without_stt() -> None: props = properties(base_agent()) - assert props["asr"] == {"vendor": "ares"} - assert props["turn_detection"] == {"language": "en"} + assert props["asr"] == {"vendor": "ares", "language": "en-US"} + assert props["turn_detection"] == {"language": "en-US"} def test_stt_vendor_params_match_documented_shapes() -> None: - assert DeepgramSTT(model="nova-3", language="en-US").to_config()["params"] == { + deepgram_managed = DeepgramSTT(model="nova-3", language="en-US").to_config() + assert "language" not in deepgram_managed + assert 
deepgram_managed["params"] == { "model": "nova-3", "language": "en-US", } @@ -132,8 +134,30 @@ def test_stt_vendor_params_match_documented_shapes() -> None: "language_code": "en-US", } - assert AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws").to_config()["params"] == { + assemblyai_config = AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws").to_config() + assert "language" not in assemblyai_config + assert assemblyai_config["params"] == { "api_key": "assembly-key", "language": "en-US", "uri": "wss://example.test/ws", } + + +def test_assemblyai_params_stay_nested_and_asr_language_comes_from_turn_detection() -> None: + props = properties( + Agent(turn_detection=TurnDetectionConfig(language="fr-FR")) + .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini", base_url="https://api.openai.com/v1/chat/completions")) + .with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5", base_url="wss://api.elevenlabs.io/v1")) + .with_stt(AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws")) + ) + + assert props["asr"] == { + "vendor": "assemblyai", + "language": "fr-FR", + "params": { + "api_key": "assembly-key", + "language": "en-US", + "uri": "wss://example.test/ws", + }, + } + assert props["turn_detection"] == {"language": "fr-FR"} From b66d871314ca0e5929cb9c9095949a7fd5e856a7 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:30:41 +0000 Subject: [PATCH 03/14] [fern-generated] Update SDK Generated by Fern CLI Version: unknown Generators: - fernapi/fern-python-sdk: 4.37.0 --- scripts/check_release_workflow.py | 54 ------------------- ...gents_request_properties_turn_detection.py | 6 --- src/agora_agent/core/client_wrapper.py | 4 +- src/agora_agent/types/asr.py | 1 - src/agora_agent/types/deepgram_asr.py | 4 -- src/agora_agent/types/deepgram_asr_params.py | 2 +- 6 files changed, 3 
insertions(+), 68 deletions(-) delete mode 100644 scripts/check_release_workflow.py diff --git a/scripts/check_release_workflow.py b/scripts/check_release_workflow.py deleted file mode 100644 index 1a6e065..0000000 --- a/scripts/check_release_workflow.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -import re -import sys -from pathlib import Path -from typing import NoReturn - - -def fail(message: str) -> NoReturn: - print(message, file=sys.stderr) - raise SystemExit(1) - - -def read_version(path: str) -> str: - text = Path(path).read_text() - match = re.search(r'^version\s*=\s*"v?([^"]+)"', text, re.M) - if not match: - fail(f"version not found in {path}") - return match.group(1) - - -def read_compat_dependency(path: str) -> str: - text = Path(path).read_text() - match = re.search(r'^agora-agents\s*=\s*"([^"]+)"', text, re.M) - if not match: - fail(f"agora-agents dependency not found in {path}") - return match.group(1) - - -root_version = read_version("pyproject.toml") -compat_pyproject = "compat/agora-agent-server-sdk/pyproject.toml" -compat_version = read_version(compat_pyproject) -compat_dependency = read_compat_dependency(compat_pyproject) - -if compat_version != root_version: - fail(f"Compat package version ({compat_version}) must match root package version ({root_version}).") - -expected_dependency = f">={root_version},<3.0.0" -if compat_dependency != expected_dependency: - fail(f"Compat package dependency on agora-agents ({compat_dependency}) must be {expected_dependency}.") - -release_workflow = Path(".github/workflows/release.yml").read_text() -required_workflow_markers = [ - ("contents: write", "release workflow must have contents: write so it can create GitHub releases"), - ("gh release create", "release workflow must create a GitHub release when one does not exist"), - ("gh release edit", "release workflow must update an existing GitHub release"), - ("release_notes.md", "release workflow must generate and use a release notes file"), -] - 
-for marker, message in required_workflow_markers: - if marker not in release_workflow: - fail(message) - -print("Release metadata and workflow checks passed.") diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py index fb58a36..40dbb02 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py @@ -5,7 +5,6 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel -from ...types.asr_language import AsrLanguage from .start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from .start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness from .start_agents_request_properties_turn_detection_interrupt_mode import ( @@ -19,11 +18,6 @@ class StartAgentsRequestPropertiesTurnDetection(UncheckedBaseModel): Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. """ - language: typing.Optional[AsrLanguage] = pydantic.Field(default=None) - """ - BCP-47 language tag identifying the primary language used for agent interaction. 
- """ - mode: typing.Optional[typing.Literal["default"]] = pydantic.Field(default=None) """ Conversation turn detection mode: diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index a8efe07..2df9814 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.1.1", + "User-Agent": "agora-agents/v2.1.2", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.1.1", + "X-Fern-SDK-Version": "v2.1.2", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py index f08086f..1f2225d 100644 --- a/src/agora_agent/types/asr.py +++ b/src/agora_agent/types/asr.py @@ -54,7 +54,6 @@ class Asr_Deepgram(UncheckedBaseModel): vendor: typing.Literal["deepgram"] = "deepgram" language: typing.Optional[AsrLanguage] = None params: DeepgramAsrParams - keyterm: typing.Optional[str] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py index 1c79c7b..723cd86 100644 --- a/src/agora_agent/types/deepgram_asr.py +++ b/src/agora_agent/types/deepgram_asr.py @@ -16,10 +16,6 @@ class DeepgramAsr(UncheckedBaseModel): language: typing.Optional[AsrLanguage] = None params: DeepgramAsrParams - keyterm: typing.Optional[str] = pydantic.Field(default=None) - """ - Boost specialized terms and brands for preset-backed Deepgram usage. 
- """ if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py index 259958e..6688333 100644 --- a/src/agora_agent/types/deepgram_asr_params.py +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -34,7 +34,7 @@ class DeepgramAsrParams(UncheckedBaseModel): keyterm: typing.Optional[str] = pydantic.Field(default=None) """ - Boost specialized terms and brands + Boost specialized terms and brands for Deepgram. """ if IS_PYDANTIC_V2: From 83e9b9c3fc79f1d7a578de641494a08e54a4468b Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:31:14 +0000 Subject: [PATCH 04/14] [fern-replay] Applied customizations Patches applied (14): - patch-7c2d9d99: feat(agentkit): align session options and token uid handling - patch-7465fada: fix(agentkit): resolve Python session typing issues - patch-fae1249a: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
- patch-49af6f65: Align AgentKit TTS provider options with docs - patch-bad47d96: Align AgentKit provider BYOK parameter requirements - patch-434c8af1: Align AgentKit LLM and ASR vendor validation - patch-968e1f03: Restrict managed OpenAI LLM models in AgentKit - patch-676b93b3: Align managed vendor validation with generated core shapes - patch-8d52340e: fix(agentkit): flatten Deepgram TTS passthrough params - patch-cb9ab8b8: docs(agentkit): align OpenAI TTS instructions support - patch-299e4bd9: fix(agentkit): resolve provider config type checks - patch-583eccc0: Move AgentKit language to turn detection - patch-bed29b6b: chore: bump Python packages to 2.1.0 - patch-776f7c4a: Fix vendor validation matrix for presets, pipeline_id, and deprecation path Patches with unresolved conflicts (28): - patch-6e30398b: chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases - patch-9df782b4: feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 - patch-26706d73: feat(agentkit): add GenericAvatar and session-aware avatar validation - patch-9f491c63: feat(agentkit): update Agent builder and session lifecycle for v2.7 - patch-64703bda: test(agentkit): add custom tests for v1.5.0 AgentKit behavior - patch-6c20f076: docs(agentkit): update v1.5.0 guides, reference, and changelog - patch-eaec58eb: refactor(agentkit): align deprecated vendor aliases with canonical names - patch-20245632: feat(agentkit): export type aliases and avatar token helpers - patch-972dd5bd: updated docs - patch-4323b470: rename python package to agora-agents - patch-d29165c4: make python compat package publishable - patch-fc9d93c3: Document agora-agents PyPI install name and migration notes - patch-44c21c14: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
- patch-87fc4488: Update docs to import from agora_agent package root - patch-923cf954: Prioritize app credentials and builder in Python docs Rewrite getting-started auth and quick-start for app credentials with the builder API. De-emphasize presets and align index, BYOK, and README with the recommended onboarding path. - patch-d475306b: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. - patch-c9355576: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. - patch-98ecb4d3: Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM vendor helpers. Introduce named LLM vendor classes with correct request serialization, export them from the package root, and add tests covering each provider's config shape. - patch-a5097b8d: Document new LLM vendors and tighten onboarding docs. Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM to vendor references, simplify README and index navigation, and align quick-start and terminology with Agora-managed model language. 
- patch-7d30c9dd: Add AgentKit ASR interaction language handling - patch-a95214eb: Document AgentKit ASR language and STT params - patch-eeac05d0: Move prompt and greeting docs to vendor config - patch-a94bac6d: Align AgentKit provider wrappers with regenerated core schemas - patch-198f367f: Update AgentKit TTS provider docs and examples - patch-96afe786: align v2.1 provider docs with AgentKit validation - patch-617ee134: feat(agentkit): support agent-level pipeline_id - patch-8e22e6d0: udpated agent docs - patch-b76a7006: Bump Python SDK version metadata and request headers to v2.1.1 Run `fern-replay resolve` to apply these customizations. Patches absorbed by generator (5): - patch-b7f0c36c: feat(agentkit): release v2.0.0 updates - patch-4d32368c: Add compat-build CI job and harden dual-package PyPI publish Build and verify the compat wheel re-exports, gate publish on compat-build, simplify version checks with poetry version, wait for primary package on PyPI, and retry compat publish on failure. - patch-20109390: Fix PyPI publish auth and explicitly protect release workflow in Fern ignore. Use PYPI_API_TOKEN for primary and compat Poetry publishes, matching the v1.4.1 release flow, and list release.yml explicitly in .fernignore. - patch-0297a70e: Update AgentKit v2.1 provider docs and examples - patch-c9022354: docs(agentkit): align TTS provider reference fields The generator now produces these customizations natively. 
--- .fern/replay.lock | 17991 +++++++++++++++- docs/reference/vendors.md | 1 + scripts/check_release_workflow.py | 54 + src/agora_agent/agentkit/agent.py | 19 + src/agora_agent/agentkit/agent_session.py | 1 + src/agora_agent/agentkit/vendors/avatar.py | 43 + src/agora_agent/agentkit/vendors/llm.py | 5 +- src/agora_agent/agentkit/vendors/mllm.py | 1 + src/agora_agent/agentkit/vendors/stt.py | 3 + src/agora_agent/agentkit/vendors/tts.py | 6 + ...gents_request_properties_turn_detection.py | 6 + tests/custom/test_agentkit_agent.py | 298 + tests/custom/test_agentkit_session.py | 383 + tests/custom/test_agentkit_vendors.py | 122 + 14 files changed, 18930 insertions(+), 3 deletions(-) create mode 100644 scripts/check_release_workflow.py create mode 100644 tests/custom/test_agentkit_agent.py create mode 100644 tests/custom/test_agentkit_session.py create mode 100644 tests/custom/test_agentkit_vendors.py diff --git a/.fern/replay.lock b/.fern/replay.lock index 536b6e4..a435ef4 100644 --- a/.fern/replay.lock +++ b/.fern/replay.lock @@ -6,5 +6,17992 @@ generations: timestamp: 2026-05-20T20:38:02.180Z cli_version: unknown generator_versions: {} -current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf -patches: [] + - commit_sha: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + tree_hash: db7756fbc0a5c6923371615dd752c8e17b2d828b + timestamp: 2026-06-04T20:30:41.901Z + cli_version: unknown + generator_versions: + fernapi/fern-python-sdk: 4.37.0 +current_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 +patches: + - id: patch-6e30398b + content_hash: sha256:e99898e508e2d6cb9f134cc33e0b73c1c8acb845f5887924e0e38031a6e089c0 + original_commit: 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 + original_message: "chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/vendors/__init__.py + patch_content: | + diff 
--git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 1942bce..5ceda66 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -1,13 +1,30 @@ + from .agent import ( + Agent, + + AgentConfig, + + AgentConfigUpdate, + + ConversationHistory, + + ConversationRole, + + ConversationSessionTurn, + + ConversationTurn, + + ConversationTurns, + StartAgentsRequestProperties, + + AvatarConfig, + + AvatarVendor, + GeofenceConfig, + + LlmConfig, + + LlmStyle, + + MllmConfig, + + MllmVendor, + RtcConfig, + + SttConfig, + + SttVendor, + + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + @@ -37,9 +54,14 @@ from .agent import ( + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + + SessionInfo, + + SessionListResponse, + + SessionSummary, + + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + @@ -57,8 +79,10 @@ from ..agent_management.types.agent_think_agent_management_request_on_speaking_a + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -112,6 +136,7 @@ from .vendors import ( + FishAudioTTS, + Gemini, + GeminiLive, + + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + @@ -132,14 +157,27 @@ from .vendors import ( + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + + XaiGrok, + + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + + "AgentConfig", + + "AgentConfigUpdate", + # Return 
type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + + "LlmConfig", + + "LlmStyle", + + "SttConfig", + + "SttVendor", + + "TtsConfig", + + "MllmConfig", + + "MllmVendor", + + "AvatarConfig", + + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + @@ -147,6 +185,7 @@ __all__ = [ + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + @@ -181,6 +220,7 @@ __all__ = [ + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + @@ -197,6 +237,15 @@ __all__ = [ + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + + "SessionInfo", + + "SessionListResponse", + + "SessionSummary", + + "ConversationHistory", + + "ConversationTurn", + + "ConversationRole", + + "ConversationTurns", + + "ConversationSessionTurn", + + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + @@ -253,14 +302,19 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + + "is_generic_avatar", + + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 0320843..689eab1 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -11,9 +11,9 @@ from .base import ( + OpenAISampleRate, + SampleRate, + ) + -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar + +from 
.avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + @@ -82,8 +82,11 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + ] + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ) + from .agent_session import AgentSession, 
AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + 
MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + XaiGrok, + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + 
"SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, 
AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + status: unresolved + - id: patch-9df782b4 + content_hash: sha256:84c08fe3239d2ecb0b0a3ddd33b0dce4e7b012125be797aa83ca12893363b565 + original_commit: 9df782b46d872599f103078e30c5ded2053f2517 + original_message: "feat(agentkit): update MLLM and LLM vendor wrappers for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/llm.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:57:54 -0400 + Subject: [PATCH] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 + + Adds xAI Grok 
Realtime and Vertex AI MLLM wrappers, and aligns MLLM + config serialization with the generated core types. LLM vendors now + accept typed greeting_configs and serialize them through the generated + model shape, including interruptable. + --- + src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- + src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ + 2 files changed, 113 insertions(+), 36 deletions(-) + + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 7465c9f..6f74b43 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,9 +1,14 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] + result.append(item) + return result + + + + +def _dump_optional_model(value: Any) -> Any: + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -74,7 +87,7 @@ class OpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: 
Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -150,7 +163,7 @@ class AzureOpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -216,7 +229,7 @@ class Anthropic(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + 
greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -287,7 +300,7 @@ class Gemini(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 5f6f940..cd6cd07 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -53,18 +52,97 @@ class OpenAIRealtime(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if 
self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + + + +class XaiGrokOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="xAI API key") + + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + + + +class XaiGrok(BaseMLLM): + + """xAI 
Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + + + def __init__(self, **kwargs: Any): + + self.options = XaiGrokOptions(**kwargs) + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = dict(self.options.params or {}) + + if self.options.voice is not None: + + params["voice"] = self.options.voice + + if self.options.language is not None: + + params["language"] = self.options.language + + if self.options.sample_rate is not None: + + params["sample_rate"] = self.options.sample_rate + + + + config: Dict[str, Any] = { + + "vendor": "xai", + + "api_key": self.options.api_key, + + "url": self.options.url, + + "params": params, + + } + + + + if self.options.greeting_message is not None: + + config["greeting_message"] = self.options.greeting_message + + if self.options.input_modalities is not None: + + config["input_modalities"] = self.options.input_modalities + + if self.options.output_modalities is not None: + + config["output_modalities"] = self.options.output_modalities + + if self.options.messages is not None: + + config["messages"] = self.options.messages + + if self.options.failure_message is not None: + + config["failure_message"] = self.options.failure_message + + if self.options.turn_detection is not None: + + config["turn_detection"] = self.options.turn_detection + + + + return config + + + + + +class XaiRealtimeOptions(XaiGrokOptions): + + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + + + def __init__(self, **data: Any): + + warnings.warn( + + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**data) + + + + + +class XaiRealtime(XaiGrok): + + """Deprecated: use :class:`XaiGrok` instead.""" + + + + def __init__(self, **kwargs: Any): + + warnings.warn( + + "XaiRealtime is deprecated; use XaiGrok instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**kwargs) + + + + + class VertexAIOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + - params: Dict[str, Any] = { + - "model": self.options.model, + - "project_id": self.options.project_id, + - "location": self.options.location, + - "adc_credentials_string": self.options.adc_credentials_string, + - } + - + + # additional_params spread first so that explicit fields always win, + + # matching the TypeScript SDK. 
+ + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + params["model"] = self.options.model + + params["project_id"] = self.options.project_id + + params["location"] = self.options.location + + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + - if self.options.additional_params is not None: + - params.update(self.options.additional_params) + + config: Dict[str, Any] = { + "vendor": "vertexai", + @@ -119,12 +193,8 @@ class VertexAI(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -179,12 
+247,8 @@ class GeminiLive(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(default="gpt-4o-mini", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + endpoint: str = Field(..., 
description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + max_tokens: Optional[int] = Field(default=None, gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + 
failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or "https://api.anthropic.com/v1/messages", + "api_key": self.options.api_key, + "params": params, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = 
self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + src/agora_agent/agentkit/vendors/mllm.py: 
| + import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, 
description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + + class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: 
str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: 
Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + - id: patch-26706d73 + content_hash: sha256:a9551e0b774b96e7734e9faa7d770611861cf443837428272ef75710447238da + original_commit: 26706d73ae15d860d57daf926837632c01be7f10 + original_message: "feat(agentkit): add GenericAvatar and 
session-aware avatar validation" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/vendors/avatar.py + patch_content: |+ + From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:59:22 -0400 + Subject: [PATCH] feat(agentkit): add GenericAvatar and session-aware avatar + validation + + Adds the GenericAvatar vendor wrapper and extends avatar validation + helpers for generic and RTC-backed avatars. Session-derived fields such + as agora_appid, agora_channel, and agora_token can now be validated + after AgentSession enrichment. + --- + src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- + src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ + 2 files changed, 76 insertions(+), 1 deletion(-) + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index 9e132a9..a04809c 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + return config.get("vendor") == "generic" + + + + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + ) + + + + + +def validate_avatar_config( + + config: typing.Dict[str, typing.Any], + + require_session_fields: bool = False, + +) -> None: + """Validates avatar configuration at runtime. 
+ + Parameters + @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + + if require_session_fields and not params.get("agora_token"): + + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + + elif is_generic_avatar(config): + + params = config.get("params", {}) + + if not params.get("api_key"): + + raise ValueError("Generic avatar requires api_key") + + if not params.get("api_base_url"): + + raise ValueError("Generic avatar requires api_base_url") + + if not params.get("avatar_id"): + + raise ValueError("Generic avatar requires avatar_id") + + if not params.get("agora_uid"): + + raise ValueError("Generic avatar requires agora_uid") + + if require_session_fields: + + if not params.get("agora_token"): + + raise ValueError("Generic avatar requires agora_token after session enrichment") + + if not params.get("agora_appid"): + + raise ValueError("Generic avatar requires agora_appid after session enrichment") + + if not params.get("agora_channel"): + + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index b83a356..00cad8f 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -132,6 +132,48 @@ class LiveAvatarAvatar(BaseAvatar): + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + +class GenericAvatarOptions(BaseModel): + + 
model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Generic avatar provider API key") + + api_base_url: str = Field(..., description="Avatar provider API base URL") + + avatar_id: str = Field(..., description="Avatar ID") + + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + +class GenericAvatar(BaseAvatar): + + def __init__(self, **kwargs: Any): + + self.options = GenericAvatarOptions(**kwargs) + + + + @property + + def required_sample_rate(self) -> int: + + return 0 + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = { + + "api_key": self.options.api_key, + + "api_base_url": self.options.api_base_url, + + "avatar_id": self.options.avatar_id, + + "agora_uid": self.options.agora_uid, + + } + + + + if self.options.agora_appid is not None: + + params["agora_appid"] = self.options.agora_appid + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + + if self.options.agora_channel is not None: + + params["agora_channel"] = self.options.agora_channel + + if self.options.additional_params is not None: + + params = {**self.options.additional_params, **params} + + + + enable = self.options.enable if self.options.enable is not None else True + + return {"enable": enable, "vendor": "generic", "params": params} + + + + + class AnamAvatarOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/avatar_types.py: | + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. + """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. 
" + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. 
+ """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + AKOOL_SAMPLE_RATE = 16000 + + + class HeyGenAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="HeyGen API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, 
description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific 
parameters") + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + pass + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = 
Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + status: unresolved + - id: patch-9f491c63 + content_hash: sha256:d9811b2c5927be74f2125444dcf36642b88ad7be422019688cb0228093dce1d0 + original_commit: 9f491c63a964c13c67ba4af3708379e1b75a92d8 + original_message: "feat(agentkit): update Agent builder and session lifecycle for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + patch_content: |+ + From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 21:00:58 -0400 + Subject: [PATCH] feat(agentkit): update Agent builder and session lifecycle + for v2.7 + + Aligns Agent and AgentSession with the generated v2.7 request shape. + MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars + are rejected when MLLM is configured. AgentSession now enriches generic + and RTC avatars with session context, auto-generates avatar tokens, + validates TTS sample rates from vendor-specific fields, and adds + paginated get_turns/get_all_turns helpers with fail-fast pagination + guards. 
+ --- + src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- + src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- + 2 files changed, 360 insertions(+), 35 deletions(-) + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 70a1bdd..86a958e 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -8,6 +8,24 @@ if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + +from ..agents.types.get_agents_response import GetAgentsResponse + +from ..agents.types.list_agents_response import ListAgentsResponse + +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + +from 
..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger import + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + +LlmConfig = StartAgentsRequestPropertiesLlm + +LlmStyle = StartAgentsRequestPropertiesLlmStyle + +SttConfig = StartAgentsRequestPropertiesAsr + +SttVendor = StartAgentsRequestPropertiesAsrVendor + +TtsConfig = Tts + +MllmConfig = StartAgentsRequestPropertiesMllm + +MllmVendor = 
StartAgentsRequestPropertiesMllmVendor + +AvatarConfig = StartAgentsRequestPropertiesAvatar + +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + @@ -93,6 +122,18 @@ InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + +AgentConfig = StartAgentsRequestProperties + +AgentConfigUpdate = UpdateAgentsRequestProperties + +SessionInfo = GetAgentsResponse + +SessionListResponse = ListAgentsResponse + +SessionSummary = ListAgentsResponseDataListItem + +ConversationHistory = GetHistoryAgentsResponse + +ConversationTurn = GetHistoryAgentsResponseContentsItem + +ConversationRole = GetHistoryAgentsResponseContentsItemRole + +ConversationTurns = GetTurnsAgentsResponse + +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + +SpeakPriority = SpeakAgentsRequestPriority + +Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + @@ -116,6 +157,7 @@ FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + @@ -183,9 +225,20 @@ class Agent: + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + + sample_rate = vendor.sample_rate + + if ( + + self._avatar_required_sample_rate not in (None, 0) + + and sample_rate is not None + 
+ and sample_rate != self._avatar_required_sample_rate + + ): + + raise ValueError( + + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + + f"but TTS is configured with {sample_rate} Hz. " + + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + - new_agent._tts_sample_rate = vendor.sample_rate + + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + @@ -194,6 +247,9 @@ class Agent: + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + + # Note: avatars are not supported with MLLM. The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` so callers can still + + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + @@ -202,7 +258,10 @@ class Agent: + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + + advanced_features_model = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_mllm": None}, + + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + @@ -214,6 +273,10 @@ class Agent: + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + + # Note: avatars are not supported with MLLM. 
The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + + # enabled) so callers may still combine the two for testing or for the + + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + @@ -282,7 +345,10 @@ class Agent: + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + + new_agent._advanced_features = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_tools": enabled}, + + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + @@ -294,6 +360,23 @@ class Agent: + new_agent._parameters = parameters + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + + new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. 
+ + @@ -342,6 +425,33 @@ class Agent: + new_agent._filler_words = filler_words + return new_agent + + + @staticmethod + + def _field_value(value: typing.Any, field: str) -> typing.Any: + + if value is None: + + return None + + if isinstance(value, dict): + + return value.get(field) + + return getattr(value, field, None) + + + + @staticmethod + + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + + if hasattr(value, "model_copy"): + + return value.model_copy(update=update) + + if hasattr(value, "copy"): + + return value.copy(update=update) + + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + + data_channel = self._field_value(self._parameters, "data_channel") + + if not enable_rtm or data_channel is not None: + + return self._parameters + + if self._parameters is None: + + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + + if isinstance(self._parameters, dict): + + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + + @property + def name(self) -> typing.Optional[str]: + return self._name + @@ -354,6 +464,10 @@ class Agent: + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + + @property + + def tts_sample_rate(self) -> typing.Optional[int]: + + return self._tts_sample_rate + + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + @@ -536,6 +650,20 @@ class Agent: + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + + # Validate the MLLM + enabled-avatar combination BEFORE generating the + + # RTC token so callers 
get a clear, actionable error first (matches the + + # TypeScript and Go SDKs' fail-fast contract). + + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + + avatar_enabled = ( + + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + + ) + + if is_mllm_mode and avatar_enabled: + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + @@ -553,9 +681,6 @@ class Agent: + **token_kwargs, + ) + + - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + - is_mllm_mode = bool(mllm_flag or self._mllm is not None) + - + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + @@ -579,11 +704,12 @@ class Agent: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + - if self._parameters is not None: + - if isinstance(self._parameters, dict): + - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + + parameters = self._resolved_parameters() + + if parameters is not None: + + if isinstance(parameters, dict): + + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + - base_kwargs["parameters"] = self._parameters + + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + @@ -596,12 +722,10 @@ class Agent: + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + - if self._greeting: + + if self._greeting is not None: + 
mllm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + - if self._max_history is not None: + - mllm_config.setdefault("max_history", self._max_history) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + @@ -617,14 +741,14 @@ class Agent: + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + - if self._instructions: + + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + - if self._greeting: + - llm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + - llm_config.setdefault("failure_message", self._failure_message) + + if self._greeting is not None: + + llm_config["greeting_message"] = self._greeting + + if self._failure_message is not None: + + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + - llm_config.setdefault("max_history", self._max_history) + + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index 2408659..e41a399 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_request_on_thinking_a + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent 
import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -182,17 +185,29 @@ class _AgentSessionBase: + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + + if self._is_mllm_mode(): + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + + sample_rate = self._agent.tts_sample_rate + + if sample_rate is None and isinstance(tts_params, dict): + + sample_rate = ( + + tts_params.get("sample_rate") + + or tts_params.get("sample_rate_hertz") + + or tts_params.get("samplingRate") + + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + @@ -211,6 +226,50 @@ class _AgentSessionBase: + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + + avatar = properties.get("avatar") + + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + + return + + + + params = avatar.get("params") + + if not isinstance(params, dict): + + params = {} + + avatar["params"] = params + + + + if is_generic_avatar(avatar): + + if not params.get("agora_appid"): + + params["agora_appid"] = self._app_id + + if not params.get("agora_channel"): + + params["agora_channel"] = self._channel + + + + if not is_rtc_avatar(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_token"): + + if not self._app_certificate: + + raise ValueError( + + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + + ) + + token_kwargs: typing.Dict[str, typing.Any] = {} + + if self._expires_in is not None: + + token_kwargs["token_expire"] = self._expires_in + + params["agora_token"] = generate_convo_ai_token( + + app_id=self._app_id, + + app_certificate=self._app_certificate, + + channel_name=self._channel, + + account=str(params["agora_uid"]), + + **token_kwargs, + + ) + + + + if str(params.get("agora_uid")) == self._agent_uid: + + self._warn( + + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ + ) + + + + validate_avatar_config(avatar, require_session_fields=True) + + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + @@ -238,12 +297,17 @@ class _AgentSessionBase: + **token_opts, + ) + properties = self._dump_model(base_properties) + + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + - mllm = dict(self._agent.mllm) + - if self._agent.greeting: + + mllm = self._dump_model(self._agent.mllm) + + if not isinstance(mllm, dict): + + mllm = {} + + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + + if self._agent.failure_message is not None: + + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + @@ -251,20 +315,41 @@ class _AgentSessionBase: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + - if self._agent.instructions: + + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + - if self._agent.greeting: + - llm.setdefault("greeting_message", self._agent.greeting) + - if self._agent.failure_message: + - llm.setdefault("failure_message", self._agent.failure_message) + + if self._agent.greeting is not None: + + llm["greeting_message"] = self._agent.greeting + + if self._agent.failure_message is not None: + + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + - llm.setdefault("max_history", self._agent.max_history) + + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + + @staticmethod + + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + + if pagination is None: + + return None + + if 
isinstance(pagination, dict): + + return pagination.get(field) + + return getattr(pagination, field, None) + + + + @staticmethod + + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + + return list(turns or []) + + + + @classmethod + + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + + data = cls._dump_model(first_response) + + if not isinstance(data, dict): + + data = {} + + data["turns"] = turns + + return GetTurnsAgentsResponse(**data) + + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + @@ -484,7 +569,12 @@ class AgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -547,15 +637,68 @@ class AgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - def get_turns(self) -> typing.Any: + + def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." + + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + @@ -734,7 +877,12 @@ class AsyncAgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -797,11 +945,64 @@ class AsyncAgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - async def get_turns(self) -> typing.Any: + + async def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = await self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = await self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from 
..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from 
..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, 
BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and 
.end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + 
silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and 
sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + 
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + 
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history 
is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + status: unresolved + - id: patch-eaec58eb + content_hash: sha256:8390ced175326080fc76021a97d315e71229bbc9ad70eef35a63eb9968df7830 + original_commit: eaec58eb2edfe03b1311a32dd137a867edf5d096 + original_message: "refactor(agentkit): align deprecated vendor aliases with canonical names" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/__init__.py + - src/agora_agent/agentkit/vendors/avatar.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 689eab1..8e2042e 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -13,7 +13,7 @@ from .base import ( + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + @@ -83,7 +83,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index 00cad8f..50bdd08 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -5,19 +5,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + -HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + -class HeyGenAvatarOptions(BaseModel): + +class 
LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + - api_key: str = Field(..., description="HeyGen API key") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + @@ -31,20 +31,14 @@ class HeyGenAvatarOptions(BaseModel): + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + -class HeyGenAvatar(BaseAvatar): + - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - warnings.warn( + - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - self.options = HeyGenAvatarOptions(**kwargs) + + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return HEYGEN_SAMPLE_RATE + + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + @@ -65,71 +59,79 @@ class HeyGenAvatar(BaseAvatar): + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "heygen", "params": params} + + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + -class AkoolAvatarOptions(BaseModel): + - model_config = ConfigDict(extra="forbid") + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + - api_key: str = Field(..., description="Akool API key") + - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + -class AkoolAvatar(BaseAvatar): + +class HeyGenAvatar(BaseAvatar): + + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + + def __init__(self, **kwargs: Any): + - self.options = AkoolAvatarOptions(**kwargs) + + warnings.warn( + + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return AKOOL_SAMPLE_RATE + + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + + "quality": self.options.quality, + + "agora_uid": self.options.agora_uid, + } + + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + + if self.options.disable_idle_timeout is not None: + + params["disable_idle_timeout"] = self.options.disable_idle_timeout + + if self.options.activity_idle_timeout is not None: + + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "akool", "params": params} + + return {"enable": enable, "vendor": "heygen", "params": params} + + + -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + - pass + +class AkoolAvatarOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Akool API key") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + -class LiveAvatarAvatar(BaseAvatar): + +class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - 
self.options = LiveAvatarAvatarOptions(**kwargs) + + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return LIVEAVATAR_SAMPLE_RATE + + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + - "quality": self.options.quality, + - "agora_uid": self.options.agora_uid, + } + + - if self.options.agora_token is not None: + - params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + - if self.options.disable_idle_timeout is not None: + - params["disable_idle_timeout"] = self.options.disable_idle_timeout + - if self.options.activity_idle_timeout is not None: + - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "liveavatar", "params": params} + + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + - persona_id: Optional[str] = Field(default=None, description="Persona ID") + + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index cd6cd07..b58f040 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,4 +1,3 @@ + -import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -119,30 +118,6 @@ class XaiGrok(BaseMLLM): + return config + + + -class XaiRealtimeOptions(XaiGrokOptions): + - """Deprecated: use :class:`XaiGrokOptions` instead.""" + - + - def __init__(self, **data: Any): + - warnings.warn( + - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**data) + - + - + -class XaiRealtime(XaiGrok): + - """Deprecated: use :class:`XaiGrok` instead.""" + - + - def __init__(self, **kwargs: Any): + - warnings.warn( + - "XaiRealtime is deprecated; use XaiGrok instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**kwargs) + - + - + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + theirs_snapshot: + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + 
MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + LIVEAVATAR_SAMPLE_RATE = 24000 + HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + class LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] 
= Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = 
self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = 
{**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + src/agora_agent/agentkit/vendors/mllm.py: | + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: 
Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). 
Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class 
VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + 
- id: patch-20245632 + content_hash: sha256:a22e4a3b114ba8105c8129ccd6222570dc1f231daf9ac6037a00bcd4e11c425b + original_commit: 20245632afd066efe5a453665b29c5ba0e13e4f8 + original_message: "feat(agentkit): export type aliases and avatar token helpers" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/constants.py + patch_content: |+ + From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 15:17:27 -0400 + Subject: [PATCH] feat(agentkit): export type aliases and avatar token helpers + + --- + src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- + src/agora_agent/agentkit/agent.py | 22 +++++++++- + src/agora_agent/agentkit/agent_session.py | 8 +++- + src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- + src/agora_agent/agentkit/constants.py | 10 +++++ + 5 files changed, 90 insertions(+), 22 deletions(-) + + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 5ceda66..e9ab221 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -2,6 +2,7 @@ from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + @@ -62,23 +63,23 @@ from .agent import ( + SessionListResponse, + SessionSummary, + SpeakPriority, + + ThinkOnListeningAction, + + ThinkOnSpeakingAction, + + ThinkOnThinkingAction, + + ThinkResponse, + ) + -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + -from ..agent_management.types.agent_think_agent_management_response import ( + - AgentThinkAgentManagementResponse as AgentThinkResponse, + -) + -from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + +# Deprecated think type aliases (prefer ThinkOn* names). + +from .agent import ( + + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + + ThinkResponse as AgentThinkResponse, + ) + +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + @@ -94,6 +95,13 @@ from .constants import ( + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + + ThinkOnListeningActionIgnore, + + ThinkOnListeningActionInject, + + ThinkOnListeningActionInterrupt, + + ThinkOnSpeakingActionIgnore, + + ThinkOnSpeakingActionInterrupt, + + ThinkOnThinkingActionIgnore, + + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + @@ -158,7 +166,6 @@ from .vendors import ( + SpeechmaticsSTT, + VertexAI, + XaiGrok, + - XaiRealtime, + LiveAvatarAvatar, + ) + + @@ -172,6 +179,7 @@ __all__ = [ + "LlmConfig", + "LlmStyle", + "SttConfig", + + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + @@ -230,6 +238,13 @@ __all__ = [ + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + + "ThinkOnListeningActionInject", + + 
"ThinkOnListeningActionInterrupt", + + "ThinkOnListeningActionIgnore", + + "ThinkOnThinkingActionInterrupt", + + "ThinkOnThinkingActionIgnore", + + "ThinkOnSpeakingActionInterrupt", + + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + @@ -246,10 +261,16 @@ __all__ = [ + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + + "ThinkResponse", + + "ThinkOnListeningAction", + + "ThinkOnThinkingAction", + + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + + "is_avatar_token_managed", + + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + @@ -303,7 +324,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + @@ -314,7 +334,6 @@ __all__ = [ + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + - "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 86a958e..14933a2 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + -from .token import generate_convo_ai_token, _validate_expires_in + +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + + AgentThinkAgentManagementRequestOnListeningAction, + +) + 
+from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + + AgentThinkAgentManagementRequestOnThinkingAction, + +) + +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + + AgentThinkAgentManagementRequestOnSpeakingAction, + +) + +from ..agent_management.types.agent_think_agent_management_response import ( + + AgentThinkAgentManagementResponse, + +) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + +AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + @@ -159,6 +171,14 @@ FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + +# Think type aliases and response + +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + +ThinkResponse = AgentThinkAgentManagementResponse + + + +from .token import generate_convo_ai_token, _validate_expires_in + + + + class Agent: + """A reusable agent definition. 
+ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e41a399..269619e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -20,10 +20,10 @@ from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + - is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -242,7 +242,11 @@ class _AgentSessionBase: + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + - if not is_rtc_avatar(avatar): + + if not is_avatar_token_managed(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index a04809c..aea9da1 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -1,3 +1,4 @@ + +import warnings + import typing + + + @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + + """Return True when AgentKit manages the avatar RTC publisher identity.""" + + return ( + + is_heygen_avatar(config) + + or is_live_avatar_avatar(config) + + or is_generic_avatar(config) + + ) + + + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + - params = config.get("params", {}) + - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + """Deprecated: use :func:`is_avatar_token_managed` for 
vendor gating.""" + + warnings.warn( + + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + + "and keep agora_uid checks in session enrichment.", + + DeprecationWarning, + + stacklevel=2, + ) + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + @@ -95,7 +110,7 @@ def validate_tts_sample_rate( + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - - HeyGen: ONLY supports 24,000 Hz + + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py + index f86e4d3..c0a852e 100644 + --- a/src/agora_agent/agentkit/constants.py + +++ b/src/agora_agent/agentkit/constants.py + @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + + + +# Think action value constants (match Fern wire values) + +ThinkOnListeningActionInject = "inject" + +ThinkOnListeningActionInterrupt = "interrupt" + +ThinkOnListeningActionIgnore = "ignore" + +ThinkOnThinkingActionInterrupt = "interrupt" + +ThinkOnThinkingActionIgnore = "ignore" + +ThinkOnSpeakingActionInterrupt = "interrupt" + +ThinkOnSpeakingActionIgnore = "ignore" + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + 
FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, + ) + # Deprecated think type aliases (prefer ThinkOn* names). 
+ from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + 
VertexAI, + XaiGrok, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + 
"AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, 
AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from 
..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import 
StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from 
..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from 
..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + 
ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem
SpeakPriority = SpeakAgentsRequestPriority
Labels = typing.Dict[str, str]


class SessionParamsInput(typing_extensions.TypedDict, total=False):
    """Plain-dict form of the session parameters accepted by :class:`Agent`.

    All keys are optional (``total=False``). ``Agent.to_properties`` converts a
    dict of this shape into ``StartAgentsRequestPropertiesParameters``.
    """

    silence_config: StartAgentsRequestPropertiesParametersSilenceConfig
    farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig
    data_channel: StartAgentsRequestPropertiesParametersDataChannel
    enable_metrics: bool
    enable_error_message: bool
    audio_scenario: ParametersAudioScenario


# LLM sub-type aliases
LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs
LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode
McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem

# Additional top-level config aliases
GeofenceConfig = StartAgentsRequestPropertiesGeofence
RtcConfig = StartAgentsRequestPropertiesRtc
FillerWordsConfig = StartAgentsRequestPropertiesFillerWords
FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger
FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig
FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent
FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig
FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule

# Think type aliases and response
ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction
ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction
ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction
ThinkResponse = AgentThinkAgentManagementResponse

from .token import generate_convo_ai_token, _validate_expires_in


class Agent:
    """A reusable agent definition.

    Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm())
    to configure vendor settings after construction.

    Every ``with_*`` method works on a clone obtained from ``_clone()`` and
    returns that clone; the agent it is called on is not reassigned.

    Examples
    --------
    >>> from agora_agent.agentkit import Agent
    >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT
    >>>
    >>> agent = Agent(instructions="You are a helpful voice assistant.")
    >>> agent = (
    ...     agent
    ...     .with_llm(OpenAI(api_key="...", model="gpt-4"))
    ...     .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000))
    ...     .with_stt(DeepgramSTT(api_key="...", model="nova-2"))
    ... )
    """

    def __init__(
        self,
        name: typing.Optional[str] = None,
        instructions: typing.Optional[str] = None,
        turn_detection: typing.Optional[TurnDetectionConfig] = None,
        interruption: typing.Optional[InterruptionConfig] = None,
        sal: typing.Optional[SalConfig] = None,
        advanced_features: typing.Optional[AdvancedFeatures] = None,
        parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None,
        greeting: typing.Optional[str] = None,
        failure_message: typing.Optional[str] = None,
        max_history: typing.Optional[int] = None,
        geofence: typing.Optional[GeofenceConfig] = None,
        labels: typing.Optional[typing.Dict[str, str]] = None,
        rtc: typing.Optional[RtcConfig] = None,
        filler_words: typing.Optional[FillerWordsConfig] = None,
    ):
        """Stores the agent-level settings; vendor settings start out unset.

        LLM, TTS, STT, MLLM and avatar configuration are attached afterwards
        through the corresponding ``with_*`` builder methods.
        """
        self._name = name
        self._instructions = instructions
        self._greeting = greeting
        self._failure_message = failure_message
        self._max_history = max_history
        self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None
        self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None
        self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None
        self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None
        self._tts_sample_rate: typing.Optional[int] = None
        self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None
        self._avatar_required_sample_rate: typing.Optional[int] = None
        self._turn_detection = turn_detection
        self._interruption = interruption
        self._sal = sal
        self._advanced_features = advanced_features
        self._parameters = parameters
        self._geofence = geofence
        # Copy the mapping (as with_labels() does) so later mutation of the
        # caller's dict does not change this agent.
        self._labels = dict(labels) if labels is not None else None
        self._rtc = rtc
        self._filler_words = filler_words

    def with_llm(self, vendor: BaseLLM) -> "Agent":
        """Returns a new Agent whose LLM config is ``vendor.to_config()``."""
        new_agent = self._clone()
        new_agent._llm = vendor.to_config()
        return new_agent

    def with_tts(self, vendor: BaseTTS) -> "Agent":
        """Returns a new Agent whose TTS config is ``vendor.to_config()``.

        Raises
        ------
        ValueError
            If a previously configured avatar requires a (non-zero) sample rate
            and the vendor's ``sample_rate`` is set to a different value.
        """
        sample_rate = vendor.sample_rate
        if (
            self._avatar_required_sample_rate not in (None, 0)
            and sample_rate is not None
            and sample_rate != self._avatar_required_sample_rate
        ):
            raise ValueError(
                f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, "
                f"but TTS is configured with {sample_rate} Hz. "
                f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}."
            )
        new_agent = self._clone()
        new_agent._tts = vendor.to_config()
        # Remembered so a later with_avatar() call can run the same check.
        new_agent._tts_sample_rate = sample_rate
        return new_agent

    def with_stt(self, vendor: BaseSTT) -> "Agent":
        """Returns a new Agent whose STT config is ``vendor.to_config()``."""
        new_agent = self._clone()
        new_agent._stt = vendor.to_config()
        return new_agent

    def with_mllm(self, vendor: BaseMLLM) -> "Agent":
        """Returns a new Agent whose MLLM config is ``vendor.to_config()``.

        When the config is a dict its ``enable`` key is forced to ``True``.
        Any ``enable_mllm`` entry is removed from ``advanced_features``; if no
        other advanced feature remains set, ``advanced_features`` becomes
        ``None``.
        """
        # Note: avatars are not supported with MLLM. The combination is rejected
        # at ``to_properties`` / ``AgentSession.start`` so callers can still
        # configure both for tests, debugging, or disabled-avatar use cases.
        new_agent = self._clone()
        new_agent._mllm = vendor.to_config()
        if isinstance(new_agent._mllm, dict):
            new_agent._mllm["enable"] = True
        if isinstance(new_agent._advanced_features, dict):
            advanced_features = {
                key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"
            }
            new_agent._advanced_features = (
                typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None
            )
        elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures):
            advanced_features_model = self._copy_model_update(
                new_agent._advanced_features,
                {"enable_mllm": None},
            )
            if (
                advanced_features_model.enable_rtm is None
                and advanced_features_model.enable_sal is None
                and advanced_features_model.enable_tools is None
            ):
                new_agent._advanced_features = None
            else:
                new_agent._advanced_features = advanced_features_model
        return new_agent

    def with_avatar(self, vendor: BaseAvatar) -> "Agent":
        """Returns a new Agent whose avatar config is ``vendor.to_config()``.

        Raises
        ------
        ValueError
            If the avatar requires a (non-zero) sample rate and TTS was already
            configured with a different one.
        """
        # Note: avatars are not supported with MLLM. The combination is rejected
        # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is
        # enabled) so callers may still combine the two for testing or for the
        # disabled-avatar pattern.
        required_sample_rate = vendor.required_sample_rate
        if (
            required_sample_rate not in (None, 0)
            and self._tts_sample_rate is not None
            and self._tts_sample_rate != required_sample_rate
        ):
            raise ValueError(
                f"Avatar requires TTS sample rate of {required_sample_rate} Hz, "
                f"but TTS is configured with {self._tts_sample_rate} Hz. "
                f"Please update your TTS sample_rate to {required_sample_rate}."
            )
        new_agent = self._clone()
        new_agent._avatar = vendor.to_config()
        # Remembered so a later with_tts() call can run the same check.
        new_agent._avatar_required_sample_rate = required_sample_rate
        return new_agent

    def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent":
        """Returns a new Agent with the specified turn detection configuration."""
        new_agent = self._clone()
        new_agent._turn_detection = config
        return new_agent

    def with_interruption(self, config: InterruptionConfig) -> "Agent":
        """Returns a new Agent with unified interruption control configured."""
        new_agent = self._clone()
        new_agent._interruption = config
        return new_agent

    def with_instructions(self, instructions: str) -> "Agent":
        """Returns a new Agent with the specified instructions.

        In the cascading pipeline ``to_properties`` sends them as the LLM's
        single system message.
        """
        new_agent = self._clone()
        new_agent._instructions = instructions
        return new_agent

    def with_greeting(self, greeting: str) -> "Agent":
        """Returns a new Agent with the specified greeting message."""
        new_agent = self._clone()
        new_agent._greeting = greeting
        return new_agent

    def with_name(self, name: str) -> "Agent":
        """Returns a new Agent with the specified name."""
        new_agent = self._clone()
        new_agent._name = name
        return new_agent

    def with_sal(self, config: SalConfig) -> "Agent":
        """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration."""
        new_agent = self._clone()
        new_agent._sal = config
        return new_agent

    def with_advanced_features(self, features: AdvancedFeatures) -> "Agent":
        """Returns a new Agent with the specified advanced features configuration.

        Use this to enable RTM and other advanced features.
        """
        new_agent = self._clone()
        new_agent._advanced_features = features
        return new_agent

    def with_tools(self, enabled: bool = True) -> "Agent":
        """Returns a new Agent with MCP tool invocation enabled or disabled."""
        new_agent = self._clone()
        if new_agent._advanced_features is None:
            new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled)
        elif isinstance(new_agent._advanced_features, dict):
            # Build a new dict instead of mutating the one shared with the source agent.
            new_agent._advanced_features = typing.cast(
                AdvancedFeatures,
                {**new_agent._advanced_features, "enable_tools": enabled},
            )
        else:
            new_agent._advanced_features = self._copy_model_update(
                new_agent._advanced_features,
                {"enable_tools": enabled},
            )
        return new_agent

    def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent":
        """Returns a new Agent with the specified session parameters.

        Use this to configure silence behaviour, graceful hang-up, data channel, and more.
        """
        new_agent = self._clone()
        new_agent._parameters = parameters
        return new_agent

    def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent":
        """Returns a new Agent with the specified RTC audio scenario."""
        new_agent = self._clone()
        if new_agent._parameters is None:
            new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario)
        elif isinstance(new_agent._parameters, dict):
            # Build a new dict instead of mutating the one shared with the source agent.
            new_agent._parameters = typing.cast(
                SessionParamsInput,
                {**new_agent._parameters, "audio_scenario": audio_scenario},
            )
        else:
            new_agent._parameters = self._copy_model_update(
                new_agent._parameters,
                {"audio_scenario": audio_scenario},
            )
        return new_agent

    def with_failure_message(self, message: str) -> "Agent":
        """Returns a new Agent with the specified failure message.

        The failure message is played via TTS when the LLM call fails.
        """
        new_agent = self._clone()
        new_agent._failure_message = message
        return new_agent

    def with_max_history(self, max_history: int) -> "Agent":
        """Returns a new Agent with the specified maximum conversation history length."""
        new_agent = self._clone()
        new_agent._max_history = max_history
        return new_agent

    def with_geofence(self, geofence: GeofenceConfig) -> "Agent":
        """Returns a new Agent with the specified geofence configuration.

        Restricts which geographic regions the agent's backend servers may run in.
        """
        new_agent = self._clone()
        new_agent._geofence = geofence
        return new_agent

    def with_labels(self, labels: typing.Dict[str, str]) -> "Agent":
        """Returns a new Agent with the specified custom labels.

        Labels are key-value pairs attached to the agent and returned in notification callbacks.
        """
        new_agent = self._clone()
        new_agent._labels = dict(labels)
        return new_agent

    def with_rtc(self, rtc: RtcConfig) -> "Agent":
        """Returns a new Agent with the specified RTC configuration."""
        new_agent = self._clone()
        new_agent._rtc = rtc
        return new_agent

    def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent":
        """Returns a new Agent with the specified filler words configuration.

        Filler words are played while the agent waits for the LLM to respond.
        """
        new_agent = self._clone()
        new_agent._filler_words = filler_words
        return new_agent

    @staticmethod
    def _field_value(value: typing.Any, field: str) -> typing.Any:
        """Reads ``field`` from a dict or an object; ``None`` if absent or if ``value`` is ``None``."""
        if value is None:
            return None
        if isinstance(value, dict):
            return value.get(field)
        return getattr(value, field, None)

    @staticmethod
    def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any:
        """Returns a copy of a model object with the ``update`` fields applied.

        Uses ``model_copy(update=...)`` when the object has it and falls back to
        ``copy(update=...)`` otherwise.

        Raises
        ------
        TypeError
            If the object offers neither method.
        """
        if hasattr(value, "model_copy"):
            return value.model_copy(update=update)
        if hasattr(value, "copy"):
            return value.copy(update=update)
        raise TypeError(f"Object of type {type(value).__name__} does not support model copying")

    def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]:
        """Returns the session parameters to send, defaulting ``data_channel`` to ``"rtm"``.

        The default is applied only when ``advanced_features.enable_rtm`` is
        ``True`` and no ``data_channel`` was set explicitly; otherwise the
        stored parameters are returned unchanged. The stored parameters are
        never mutated.
        """
        enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True
        data_channel = self._field_value(self._parameters, "data_channel")
        if not enable_rtm or data_channel is not None:
            return self._parameters
        if self._parameters is None:
            return StartAgentsRequestPropertiesParameters(data_channel="rtm")
        if isinstance(self._parameters, dict):
            return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"})
        return self._copy_model_update(self._parameters, {"data_channel": "rtm"})

    @property
    def name(self) -> typing.Optional[str]:
        """The agent name, or ``None`` if unset."""
        return self._name

    @property
    def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]:
        """The stored LLM vendor config, or ``None`` if unset."""
        return self._llm

    @property
    def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]:
        """The stored TTS vendor config, or ``None`` if unset."""
        return self._tts

    @property
    def tts_sample_rate(self) -> typing.Optional[int]:
        """The sample rate recorded by ``with_tts``, or ``None``."""
        return self._tts_sample_rate

    @property
    def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]:
        """The stored STT vendor config, or ``None`` if unset."""
        return self._stt

    @property
    def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]:
        """The stored MLLM vendor config, or ``None`` if unset."""
        return self._mllm

    @property
    def turn_detection(self) -> typing.Optional[TurnDetectionConfig]:
        """The turn detection configuration, or ``None`` if unset."""
        return self._turn_detection

    @property
    def interruption(self) -> typing.Optional[InterruptionConfig]:
        """The interruption configuration, or ``None`` if unset."""
        return self._interruption

    @property
    def instructions(self) -> typing.Optional[str]:
        """The agent instructions, or ``None`` if unset."""
        return self._instructions

    @property
    def greeting(self) -> typing.Optional[str]:
        """The greeting message, or ``None`` if unset."""
        return self._greeting

    @property
    def failure_message(self) -> typing.Optional[str]:
        """The failure message, or ``None`` if unset."""
        return self._failure_message

    @property
    def max_history(self) -> typing.Optional[int]:
        """The maximum conversation history length, or ``None`` if unset."""
        return self._max_history

    @property
    def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]:
        """The stored avatar vendor config, or ``None`` if unset."""
        return self._avatar

    @property
    def sal(self) -> typing.Optional[SalConfig]:
        """The SAL configuration, or ``None`` if unset."""
        return self._sal

    @property
    def advanced_features(self) -> typing.Optional[AdvancedFeatures]:
        """The advanced features configuration, or ``None`` if unset."""
        return self._advanced_features

    @property
    def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]:
        """The session parameters exactly as stored (no ``data_channel`` default applied)."""
        return self._parameters

    @property
    def geofence(self) -> typing.Optional[GeofenceConfig]:
        """The geofence configuration, or ``None`` if unset."""
        return self._geofence

    @property
    def labels(self) -> typing.Optional[typing.Dict[str, str]]:
        """The custom labels, or ``None`` if unset."""
        return self._labels

    @property
    def rtc(self) -> typing.Optional[RtcConfig]:
        """The RTC configuration, or ``None`` if unset."""
        return self._rtc

    @property
    def filler_words(self) -> typing.Optional[FillerWordsConfig]:
        """The filler words configuration, or ``None`` if unset."""
        return self._filler_words

    @property
    def config(self) -> typing.Dict[str, typing.Any]:
        """A new dict of the agent's stored settings, keyed by setting name.

        The dict is built on each access; its values are the stored objects
        themselves, not copies.
        """
        return {
            "name": self._name,
            "instructions": self._instructions,
            "greeting": self._greeting,
            "failure_message": self._failure_message,
            "max_history": self._max_history,
            "llm": self._llm,
            "tts": self._tts,
            "stt": self._stt,
            "mllm": self._mllm,
            "turn_detection": self._turn_detection,
            "interruption": self._interruption,
            "sal": self._sal,
            "avatar": self._avatar,
            "advanced_features": self._advanced_features,
            "parameters": self._parameters,
            "geofence": self._geofence,
            "labels": self._labels,
            "rtc": self._rtc,
            "filler_words": self._filler_words,
        }

    def create_session(
        self,
        client: typing.Any,
        channel: str,
        agent_uid: str,
        remote_uids: typing.List[str],
        name: typing.Optional[str] = None,
        token: typing.Optional[str] = None,
        idle_timeout: typing.Optional[int] = None,
        enable_string_uid: typing.Optional[bool] = None,
        preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
        pipeline_id: typing.Optional[str] = None,
        expires_in: typing.Optional[int] = None,
        debug: typing.Optional[bool] = None,
        warn: typing.Optional[typing.Callable[[str], None]] = None,
    ) -> "AgentSession":
        """Create an :class:`AgentSession` bound to this agent.

        The session name is ``name``, else the agent's own name, else
        ``agent-<current unix time>``. ``app_id`` and ``app_certificate`` are
        read from ``client`` when it has those attributes and default to ``""``
        and ``None`` otherwise. All remaining arguments are passed through to
        the session unchanged.
        """
        # Imported here rather than at module level; the top-level import of
        # this module is guarded by typing.TYPE_CHECKING.
        from .agent_session import AgentSession

        session_name = name or self._name or f"agent-{int(time.time())}"
        return AgentSession(
            client=client,
            agent=self,
            app_id=getattr(client, "app_id", ""),
            app_certificate=getattr(client, "app_certificate", None),
            name=session_name,
            channel=channel,
            token=token,
            agent_uid=agent_uid,
            remote_uids=remote_uids,
            idle_timeout=idle_timeout,
            enable_string_uid=enable_string_uid,
            preset=preset,
            pipeline_id=pipeline_id,
            expires_in=expires_in,
            debug=debug,
            warn=warn,
        )

    def create_async_session(
        self,
        client: typing.Any,
        channel: str,
        agent_uid: str,
        remote_uids: typing.List[str],
        name: typing.Optional[str] = None,
        token: typing.Optional[str] = None,
        idle_timeout: typing.Optional[int] = None,
        enable_string_uid: typing.Optional[bool] = None,
        preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
        pipeline_id: typing.Optional[str] = None,
        expires_in: typing.Optional[int] = None,
        debug: typing.Optional[bool] = None,
        warn: typing.Optional[typing.Callable[[str], None]] = None,
    ) -> "AsyncAgentSession":
        """Create an async session for use with :class:`~agora_agent.AsyncAgora`.

        Equivalent to :meth:`create_session` but returns an
        :class:`~agora_agent.agentkit.AsyncAgentSession`.
        """
        # Imported here rather than at module level; the top-level import of
        # this module is guarded by typing.TYPE_CHECKING.
        from .agent_session import AsyncAgentSession

        session_name = name or self._name or f"agent-{int(time.time())}"
        return AsyncAgentSession(
            client=client,
            agent=self,
            app_id=getattr(client, "app_id", ""),
            app_certificate=getattr(client, "app_certificate", None),
            name=session_name,
            channel=channel,
            token=token,
            agent_uid=agent_uid,
            remote_uids=remote_uids,
            idle_timeout=idle_timeout,
            enable_string_uid=enable_string_uid,
            preset=preset,
            pipeline_id=pipeline_id,
            expires_in=expires_in,
            debug=debug,
            warn=warn,
        )

    def to_properties(
        self,
        channel: str,
        agent_uid: str,
        remote_uids: typing.List[str],
        idle_timeout: typing.Optional[int] = None,
        enable_string_uid: typing.Optional[bool] = None,
        token: typing.Optional[str] = None,
        app_id: typing.Optional[str] = None,
        app_certificate: typing.Optional[str] = None,
        expires_in: typing.Optional[int] = None,
        skip_vendor_validation: bool = False,
    ) -> StartAgentsRequestProperties:
        """Builds the ``StartAgentsRequestProperties`` payload for this agent.

        Parameters
        ----------
        channel, agent_uid, remote_uids
            Sent as ``channel``, ``agent_rtc_uid`` and ``remote_rtc_uids``.
        idle_timeout, enable_string_uid
            Included in the payload only when not ``None``.
        token
            Token to send. When ``None`` a token is generated with
            ``generate_convo_ai_token`` from ``app_id`` and ``app_certificate``.
        app_id, app_certificate
            Required only when ``token`` is ``None``.
        expires_in
            When given (and a token is generated) it is checked with
            ``_validate_expires_in`` and passed on as ``token_expire``.
        skip_vendor_validation
            When ``True`` (cascading mode only) the payload is returned before
            the TTS/LLM checks and without ``llm``, ``tts`` or ``asr`` sections.

        Returns
        -------
        StartAgentsRequestProperties
            In MLLM mode: the common fields plus ``mllm``, where the agent's
            greeting and failure message fill in ``greeting_message`` /
            ``failure_message`` only if the vendor config does not set them.
            Otherwise: the common fields plus ``llm``, ``tts`` and (if STT is
            configured) ``asr``, where agent-level instructions, greeting,
            failure message and max history override the vendor's values.

        Raises
        ------
        ValueError
            If MLLM is configured together with an avatar that is not
            explicitly disabled; if ``token`` is ``None`` and ``app_id`` or
            ``app_certificate`` is missing; or, in cascading mode without
            ``skip_vendor_validation``, if TTS or LLM is not configured.
        """
        # Validate the MLLM + enabled-avatar combination BEFORE generating the
        # RTC token so callers get a clear, actionable error first (matches the
        # TypeScript and Go SDKs' fail-fast contract).
        # MLLM mode is active whenever an MLLM config is present, regardless of
        # the value of its ``enable`` key.
        mllm = self._mllm
        avatar_enabled = isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
        if mllm is not None and avatar_enabled:
            raise ValueError(
                "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
                "Remove the avatar configuration when using MLLM, or switch to a cascading session."
            )

        if token is None:
            if app_id is None or app_certificate is None:
                raise ValueError("Either token or app_id+app_certificate must be provided")
            validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None
            # Use generate_convo_ai_token (RTC + RTM) so the token works whether or
            # not the caller enables advanced_features.enable_rtm.
            token_kwargs: typing.Dict[str, typing.Any] = {}
            if validated_expires_in is not None:
                token_kwargs["token_expire"] = validated_expires_in
            token = generate_convo_ai_token(
                app_id=app_id,
                app_certificate=app_certificate,
                channel_name=channel,
                account=agent_uid,
                **token_kwargs,
            )

        base_kwargs: typing.Dict[str, typing.Any] = {
            "channel": channel,
            "token": token,
            "agent_rtc_uid": agent_uid,
            "remote_rtc_uids": remote_uids,
        }

        if idle_timeout is not None:
            base_kwargs["idle_timeout"] = idle_timeout
        if enable_string_uid is not None:
            base_kwargs["enable_string_uid"] = enable_string_uid
        if self._turn_detection is not None:
            base_kwargs["turn_detection"] = self._turn_detection
        if self._interruption is not None:
            base_kwargs["interruption"] = self._interruption
        if self._sal is not None:
            base_kwargs["sal"] = self._sal
        if self._avatar is not None:
            base_kwargs["avatar"] = self._avatar
        if self._advanced_features is not None:
            base_kwargs["advanced_features"] = self._advanced_features
        parameters = self._resolved_parameters()
        if parameters is not None:
            if isinstance(parameters, dict):
                base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
            else:
                base_kwargs["parameters"] = parameters
        if self._geofence is not None:
            base_kwargs["geofence"] = self._geofence
        if self._labels is not None:
            base_kwargs["labels"] = self._labels
        if self._rtc is not None:
            base_kwargs["rtc"] = self._rtc
        if self._filler_words is not None:
            base_kwargs["filler_words"] = self._filler_words

        if mllm is not None:
            # Work on a copy so the agent's stored MLLM config is not mutated.
            mllm_config = dict(mllm)
            # setdefault: values already present in the vendor config win here.
            if self._greeting is not None:
                mllm_config.setdefault("greeting_message", self._greeting)
            if self._failure_message is not None:
                mllm_config.setdefault("failure_message", self._failure_message)
            base_kwargs["mllm"] = mllm_config
            return StartAgentsRequestProperties(**base_kwargs)

        if skip_vendor_validation:
            return StartAgentsRequestProperties(**base_kwargs)

        if self._tts is None:
            raise ValueError("TTS configuration is required. Use with_tts() to set it.")

        if self._llm is None:
            raise ValueError("LLM configuration is required. Use with_llm() to set it.")

        # Work on a copy so the agent's stored LLM config is not mutated.
        llm_config = dict(self._llm)
        # Agent-level fields take priority over the vendor's defaults.
        # This matches the TS SDK where agent-level values override vendor config.
        if self._instructions is not None:
            llm_config["system_messages"] = [{"role": "system", "content": self._instructions}]
        if self._greeting is not None:
            llm_config["greeting_message"] = self._greeting
        if self._failure_message is not None:
            llm_config["failure_message"] = self._failure_message
        if self._max_history is not None:
            llm_config["max_history"] = self._max_history

        base_kwargs["llm"] = llm_config
        base_kwargs["tts"] = self._tts
        if self._stt is not None:
            base_kwargs["asr"] = self._stt

        return StartAgentsRequestProperties(**base_kwargs)

    def _clone(self) -> "Agent":
        new_agent = Agent.__new__(Agent)
        new_agent._name = self._name
        new_agent._llm = self._llm
        new_agent._tts = self._tts
        new_agent._stt = self._stt
        new_agent._mllm = self._mllm
        new_agent._tts_sample_rate = self._tts_sample_rate
        new_agent._avatar = self._avatar
        new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate
        new_agent._turn_detection = self._turn_detection
        new_agent._interruption = self._interruption
        new_agent._sal = self._sal
        new_agent._advanced_features = self._advanced_features
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    # The keys below are optional (``total=False``); the required keys are
    # inherited from ``_AgentSessionRequiredOptions``.
    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # Either a single preset name or a sequence of preset names.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callable that receives a warning message string.
    warn: typing.Callable[[str], None]
def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]:
    """Build the per-request ``Authorization`` header for app-credentials clients.

    Returns ``None`` unless the client's ``auth_mode`` attribute equals
    ``"app-credentials"`` (the client-level header is used in that case).
    Otherwise a ConvoAI token is generated on every call for this session's
    channel and agent UID and returned as ``Authorization: agora token=``.

    Raises
    ------
    RuntimeError
        If neither the client nor the session provides an app certificate.
    """
    client = self._client
    if getattr(client, "auth_mode", None) != "app-credentials":
        return None

    # Values found on the client win; the session's own copies are the fallback.
    credentials: typing.Dict[str, typing.Any] = {
        "app_id": getattr(client, "app_id", self._app_id),
        "app_certificate": getattr(client, "app_certificate", self._app_certificate),
    }
    if not credentials["app_certificate"]:
        raise RuntimeError("app_certificate is required for app-credentials auth mode")

    convo_token = generate_convo_ai_token(
        **credentials,
        channel_name=self._channel,
        account=self._agent_uid,
    )
    return {"Authorization": f"agora token={convo_token}"}
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." 
@staticmethod
def _dump_model(value: typing.Any) -> typing.Any:
    """Convert ``value`` into plain containers suitable for a request body.

    * Objects exposing ``model_dump`` are dumped with ``exclude_none=True``.
    * Dicts are rebuilt without ``None`` values; remaining values are
      converted recursively.
    * Lists are rebuilt with every item converted recursively (``None``
      items are kept).
    * Anything else is returned unchanged.
    """
    if hasattr(value, "model_dump"):
        return value.model_dump(exclude_none=True)

    if isinstance(value, dict):
        cleaned: typing.Dict[typing.Any, typing.Any] = {}
        for key, item in value.items():
            if item is None:
                continue
            cleaned[key] = _AgentSessionBase._dump_model(item)
        return cleaned

    if isinstance(value, list):
        converted: typing.List[typing.Any] = []
        for item in value:
            converted.append(_AgentSessionBase._dump_model(item))
        return converted

    return value
@staticmethod
def _page_value(pagination: typing.Any, field: str) -> typing.Any:
    """Read ``field`` from pagination metadata given as a dict or an object.

    Returns ``None`` when ``pagination`` is ``None`` or the field is absent.
    """
    if isinstance(pagination, dict):
        return pagination.get(field)
    return None if pagination is None else getattr(pagination, field, None)
def _emit(self, event: str, data: typing.Any) -> None:
    """Invoke every handler registered for ``event`` with ``data``.

    Parameters
    ----------
    event : str
        Key into the session's handler registry.
    data : Any
        Payload passed as the single positional argument to each handler.

    Notes
    -----
    An exception raised by a handler is reported through ``warnings.warn``
    and does not stop the remaining handlers from running.
    """
    handlers = self._event_handlers.get(event)
    if not handlers:
        return

    # Dispatch over a snapshot: a handler may unregister itself (or register
    # another handler) for this same event while it runs. Iterating the live
    # list would then skip the handler that follows the removed one.
    for handler in list(handlers):
        try:
            handler(data)
        except Exception as exc:
            # A misbehaving handler must not block the other handlers or the
            # session lifecycle; warn so the error is not silently lost.
            warnings.warn(
                f"Event handler for '{event}' raised an exception: {exc}",
                stacklevel=2,
            )
def stop(self) -> None:
    """Stop the running agent session.

    A 404 from the server is treated as a successful stop: the session is
    marked ``stopped`` and the ``stopped`` event is emitted, exactly as for
    a normal stop. Any other failure marks the session ``error``, emits the
    ``error`` event with the exception, and re-raises it.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot stop session in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    def _mark_stopped() -> None:
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})

    def _mark_failed(error: BaseException) -> None:
        self._status = "error"
        self._emit("error", error)

    self._status = "stopping"

    try:
        self._client.agents.stop(
            self._app_id, self._agent_id, request_options=self._request_options()
        )
        _mark_stopped()
    except ApiError as api_error:
        if api_error.status_code != 404:
            _mark_failed(api_error)
            raise
        # 404: the agent is already gone, which is the state we wanted.
        _mark_stopped()
    except Exception as error:
        _mark_failed(error)
        raise
def interrupt(self) -> None:
    """Interrupt the agent while it is speaking or thinking.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot interrupt in {state} state")

    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    agents_api = self._client.agents
    agents_api.interrupt(
        self._app_id,
        agent_id,
        request_options=self._request_options(),
    )
def update(self, properties: typing.Any) -> None:
    """Update the agent configuration at runtime.

    Parameters
    ----------
    properties : UpdateAgentsRequestProperties
        Partial configuration to update; forwarded unchanged to the client.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot update in {state} state")

    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    agents_api = self._client.agents
    agents_api.update(
        self._app_id,
        agent_id,
        properties=properties,
        request_options=self._request_options(),
    )
def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse:
    """Get all turn analytics pages for this session.

    Starts at page 1 and keeps requesting ``current_page + 1`` while the
    latest pagination metadata reports ``is_last_page`` as ``False``,
    accumulating the turns of every page.

    Parameters
    ----------
    page_size : int, optional
        Forwarded unchanged to every ``get_turns`` call.

    Returns
    -------
    GetTurnsAgentsResponse
        Built by ``_with_all_turns`` from the most recently fetched response
        and the accumulated turns.

    Raises ``RuntimeError`` if the server's pagination metadata is missing
    the fields required to advance, or if requesting the next page returns
    a page index that did not advance.
    """
    response = self.get_turns(page_index=1, page_size=page_size)
    all_turns = self._response_turns(response)
    # Responses may be plain dicts or model objects; read pagination accordingly.
    pagination = response.get("pagination") if isinstance(response, dict) else response.pagination
    # Fall back to page 1 when the server does not echo a page index.
    current_page = self._page_value(pagination, "page_index") or 1
    # NOTE(review): the loop continues only when is_last_page is exactly False.
    # A response that omits is_last_page ends the loop even if total_pages
    # indicates more pages - confirm this is intended.
    while pagination is not None and self._page_value(pagination, "is_last_page") is False:
        total_pages = self._page_value(pagination, "total_pages")
        returned_index = self._page_value(pagination, "page_index")
        # Without either field there is no way to tell where we are.
        if returned_index is None and total_pages is None:
            raise RuntimeError(
                "get_all_turns pagination cannot continue: response must include "
                "page_index, total_pages, or is_last_page=true."
            )
        # total_pages, when present, is a hard stop even if is_last_page is False.
        if total_pages is not None and current_page >= total_pages:
            break
        next_page = current_page + 1
        response = self.get_turns(page_index=next_page, page_size=page_size)
        all_turns.extend(self._response_turns(response))
        pagination = response.get("pagination") if isinstance(response, dict) else response.pagination
        returned_index = self._page_value(pagination, "page_index") if pagination else None
        if returned_index is not None:
            # A page index that did not move forward would loop forever unless
            # the server also says this is the last page.
            if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True:
                raise RuntimeError(
                    f"get_all_turns pagination did not advance: requested page {next_page}, "
                    f"received page {returned_index}."
                )
            current_page = returned_index
        else:
            # No echoed index: trust the requested page number, but only if
            # some other field can still terminate the loop.
            total_pages = self._page_value(pagination, "total_pages") if pagination else None
            is_last_page = self._page_value(pagination, "is_last_page") if pagination else None
            if total_pages is None and is_last_page is not True:
                raise RuntimeError(
                    "get_all_turns pagination cannot continue: response must include "
                    "page_index, total_pages, or is_last_page=true."
                )
            current_page = next_page
    # ``response`` is the last page fetched at this point, so every field
    # other than ``turns`` in the result comes from that page.
    return self._with_all_turns(response, all_turns)
async def stop(self) -> None:
    """Stop the running agent session.

    A 404 from the server is treated as a successful stop: the session is
    marked ``stopped`` and the ``stopped`` event is emitted, exactly as for
    a normal stop. Any other failure marks the session ``error``, emits the
    ``error`` event with the exception, and re-raises it.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot stop session in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    def _mark_stopped() -> None:
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})

    def _mark_failed(error: BaseException) -> None:
        self._status = "error"
        self._emit("error", error)

    self._status = "stopping"

    try:
        await self._client.agents.stop(
            self._app_id, self._agent_id, request_options=self._request_options()
        )
        _mark_stopped()
    except ApiError as api_error:
        if api_error.status_code != 404:
            _mark_failed(api_error)
            raise
        # 404: the agent is already gone, which is the state we wanted.
        _mark_stopped()
    except Exception as error:
        _mark_failed(error)
        raise
async def interrupt(self) -> None:
    """Interrupt the agent while it is speaking or thinking.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot interrupt in {state} state")

    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    agents_api = self._client.agents
    await agents_api.interrupt(
        self._app_id,
        agent_id,
        request_options=self._request_options(),
    )
async def update(self, properties: typing.Any) -> None:
    """Update the agent configuration at runtime.

    Parameters
    ----------
    properties : UpdateAgentsRequestProperties
        Partial configuration to update; forwarded unchanged to the client.

    Raises
    ------
    RuntimeError
        If the session is not ``running`` or has no agent ID.
    """
    state = self._status
    if state != "running":
        raise RuntimeError(f"Cannot update in {state} state")

    agent_id = self._agent_id
    if not agent_id:
        raise RuntimeError("No agent ID available")

    agents_api = self._client.agents
    await agents_api.update(
        self._app_id,
        agent_id,
        properties=properties,
        request_options=self._request_options(),
    )
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + src/agora_agent/agentkit/avatar_types.py: | + import warnings + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. 
+ """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + 
avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. + """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/constants.py: | + """ + Type-safe constants for agent configuration values. + Use these instead of raw strings to avoid typos and get IDE autocomplete. 
+ """ + + # Data channel: "rtm" | "datastream" + class DataChannel: + RTM = "rtm" + DATASTREAM = "datastream" + + class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + + + # Silence action when timeout elapses: "speak" | "think" + # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) + class SilenceActionValues: + SPEAK = "speak" + THINK = "think" + + + # SAL mode: "locking" | "recognition" + # (Use for sal.sal_mode — avoids shadowing SalMode type) + class SalModeValues: + LOCKING = "locking" + RECOGNITION = "recognition" + + + # Geofence area: "GLOBAL" | "NORTH_AMERICA" | "EUROPE" | "ASIA" | "INDIA" | "JAPAN" + class GeofenceArea: + GLOBAL = "GLOBAL" + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Geofence exclude area (when area is GLOBAL) + class GeofenceExcludeArea: + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Filler word selection rule: "shuffle" | "round_robin" + class FillerWordsSelectionRule: + SHUFFLE = "shuffle" + ROUND_ROBIN = "round_robin" + + + # Turn detection type (deprecated; use TurnDetectionNestedConfig.EndOfSpeech instead) + class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + # Think action value constants (match Fern wire values) + ThinkOnListeningActionInject = "inject" + ThinkOnListeningActionInterrupt = "interrupt" + ThinkOnListeningActionIgnore = "ignore" + ThinkOnThinkingActionInterrupt = "interrupt" + ThinkOnThinkingActionIgnore = "ignore" + ThinkOnSpeakingActionInterrupt = "interrupt" + ThinkOnSpeakingActionIgnore = "ignore" + status: unresolved + - id: patch-972dd5bd + content_hash: sha256:10f86db20e0b5a3800efce4913b736ff338dee29eb18cb31e89658e0293b848e + original_commit: 972dd5bdafc09b3981ab2ce4e0d02beae165c626 + original_message: updated docs + original_author: 
digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 16:13:35 -0400 + Subject: [PATCH] updated docs + + --- + docs/reference/agent.md | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 1e88b8b..3163f9c 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -264,3 +264,18 @@ to_properties( + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + + +## Type aliases + + + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + + + +## Cross-SDK discovery map + + + +| Concept | Python | TypeScript | Go | + +|---|---|---|---| + +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | + | `failure_message` | `Optional[str]` | `None` | Spoken on error | + | `max_history` | `Optional[int]` | `None` | Max conversation history length | + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | 
`filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent.agentkit.vendors import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent.agentkit.vendors import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent.agentkit.vendors import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent.agentkit.vendors import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. 
+ + + ```python + from agora_agent.agentkit.vendors import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Override the system prompt. + + ### `with_greeting(greeting: str) -> Agent` + + Override the greeting message. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. 
+ + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Set the message spoken via TTS when the LLM call fails. + + ### `with_max_history(max_history: int) -> Agent` + + Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | System prompt | + | `greeting` | `Optional[str]` | Greeting message | + | `failure_message` | `Optional[str]` | Message spoken when LLM fails | + | `max_history` | `Optional[int]` | Max conversation history length | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ + ## Cross-SDK discovery map + + | Concept | Python | TypeScript | Go | + |---|---|---|---| + | STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + | xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + | Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + | Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + status: unresolved + - id: patch-7465fada + content_hash: sha256:9c6ed2e5f48702293eed8b213cc31cce63a7ed5a1ad16a0b23e791c13e77746f + original_commit: 7465fadafa0f1e62051d99b42d0eeda85f31eeee + original_message: "fix(agentkit): resolve Python session typing issues" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - src/agora_agent/agentkit/agent_session.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index dbff562..dca9ee8 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action 
import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import ( + get_preset_category, + infer_asr_preset, + infer_llm_preset, + infer_tts_preset, + normalize_preset_input, + resolve_session_presets, + ) + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. 
+ + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. 
+ + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + 
LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation_categories: typing.AbstractSet[str], + allow_missing_vendor_categories: typing.AbstractSet[str], + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation_categories=skip_vendor_validation_categories, + allow_missing_vendor_categories=allow_missing_vendor_categories, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": 
self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + def _vendor_validation_categories( + self, + pipeline_id: typing.Optional[str], + ) -> typing.Tuple[typing.Set[str], typing.Set[str]]: + skip_categories: typing.Set[str] = set() + allow_missing_categories: typing.Set[str] = {"asr", "llm", "tts"} if pipeline_id else set() + + preset = normalize_preset_input(self._preset) + if preset: + for item in preset.split(","): + category = get_preset_category(item) + if category is not None: + skip_categories.add(category) + allow_missing_categories.add(category) + + if infer_asr_preset(self._agent.stt): + skip_categories.add("asr") + if infer_llm_preset(self._agent.llm): + skip_categories.add("llm") + if infer_tts_preset(self._agent.tts): + skip_categories.add("tts") + return skip_categories, allow_missing_categories + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + 
@classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. + """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + skip_categories, allow_missing_categories = self._vendor_validation_categories(pipeline_id) + properties = self._build_start_properties( + token_opts, + skip_vendor_validation_categories=skip_categories, + allow_missing_vendor_categories=allow_missing_categories, + ) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + 
"pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. 
+ priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + skip_categories, allow_missing_categories = self._vendor_validation_categories(pipeline_id) + properties = self._build_start_properties( + token_opts, + skip_vendor_validation_categories=skip_categories, + allow_missing_vendor_categories=allow_missing_categories, + ) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": 
resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. 
+ priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + - id: patch-d29165c4 + content_hash: sha256:be59d1d3efc435d5e0b83305b2cd39ce3dad4534a4125de18028c137e692e659 + original_commit: d29165c4ddd8296af703a4e9ed848516f563dd1b + original_message: make python compat package publishable + original_author: chenyuguo + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/pyproject.toml + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 + From: chenyuguo + Date: Wed, 27 May 2026 17:24:50 +0800 + Subject: [PATCH] make python compat package publishable + + --- + compat/agora-agent-server-sdk/README.md | 2 ++ + compat/agora-agent-server-sdk/pyproject.toml | 3 +++ + .../src/agora_agent_server_sdk_compat/__init__.py | 1 + + 3 files changed, 6 insertions(+) + create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index 1388836..cff3cfe 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -9,3 +9,5 @@ pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index 8efbe53..ac93128 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -26,6 +26,9 @@ classifiers = [ + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + +packages = [ + + { include = "agora_agent_server_sdk_compat", from = "src"} + +] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + new file mode 100644 + index 0000000..55522c6 + --- /dev/null + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -0,0 +1 @@ + +"""Compatibility package for the renamed agora-agents distribution.""" + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.0.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.0.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility package for the renamed agora-agents distribution.""" + status: unresolved + - id: patch-fae1249a + content_hash: sha256:01bf21f3cc4c784dfcff80a48c9c7bb3123af4327a567b7c990b528e9780e9a2 + original_commit: fae1249a20c53761a2eb5515a1bf92ca666760d1 + original_message: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
+ original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 27 May 2026 16:58:18 -0400 + Subject: [PATCH] Re-export agora-agents API from legacy PyPI compatibility + package The compat distribution delegates to agora_agent via __getattr__ and + documents both import paths in its README. + + --- + compat/agora-agent-server-sdk/README.md | 7 +++++-- + .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- + 2 files changed, 18 insertions(+), 3 deletions(-) + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index cff3cfe..e43d1d8 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -8,6 +8,9 @@ New projects should install: + pip install agora-agents + ``` + + -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ +```python + +from agora_agent import Agora, Area + +from agora_agent_server_sdk_compat import Agora, Area + +``` + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + index 55522c6..6283244 100644 + --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -1 +1,13 @@ + -"""Compatibility package for the renamed agora-agents distribution.""" + +"""Compatibility re-exports for the renamed agora-agents package.""" + + + +import agora_agent as _agora_agent + + + +__all__ = getattr(_agora_agent, "__all__", []) + + + + + +def __getattr__(name: str): + + return getattr(_agora_agent, name) + + + + + +def __dir__(): + + return dir(_agora_agent) + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. 
The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility re-exports for the renamed agora-agents package.""" + + import agora_agent as _agora_agent + + __all__ = getattr(_agora_agent, "__all__", []) + + + def __getattr__(name: str): + return getattr(_agora_agent, name) + + + def __dir__(): + return dir(_agora_agent) + user_owned: true + - id: patch-fc9d93c3 + content_hash: sha256:93877741bdad745fda5dd549d7c3dd6bc315f4574aabd2defb52c0c795bff011 + original_commit: fc9d93c3026a6109d8a5e8b386418592f8d121c5 + original_message: Document agora-agents PyPI install name and migration notes + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/installation.md + patch_content: | + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index c14bdb2..f6f1750 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. + ## Install with pip + + ```sh + -pip install agora-agent-sdk + +pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + -poetry add agora-agent-sdk + +poetry add agora-agents + ``` + + ## Dependencies + theirs_snapshot: + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. 
+ --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Dependencies + + The following packages are installed automatically: + + | Package | Purpose | + |---|---| + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Sync vs. Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. + status: unresolved + - id: patch-44c21c14 + content_hash: sha256:920a8a5905a3bbb134edb28b007c5c0b1b4b2c1f75753140fef305b14a64e3e0 + original_commit: 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b + original_message: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_root_exports.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index a820291..f84862c 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -231,8 +231,7 @@ class Agent: + + Examples + -------- + - >>> from agora_agent.agentkit import Agent + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index fb8e548..a749d1e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -412,12 +412,10 @@ class AgentSession(_AgentSessionBase): + + Examples + -------- + - >>> from agora_agent import Agora, Area + - >>> from agora_agent.agentkit import Agent + + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + @@ -735,12 +733,10 @@ class AsyncAgentSession(_AgentSessionBase): + + Examples + -------- + - >>> from agora_agent import AsyncAgora, Area + - >>> from agora_agent.agentkit import Agent + + >>> from 
agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py + new file mode 100644 + index 0000000..9b2f508 + --- /dev/null + +++ b/tests/custom/test_root_exports.py + @@ -0,0 +1,29 @@ + +import pytest + + + +import agora_agent + +import agora_agent.agentkit as agentkit + + + + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + + + +def test_root_exports_fern_client_symbols() -> None: + + assert agora_agent.Agora is not None + + assert agora_agent.Area is not None + + assert agora_agent.AsyncAgora is not None + + + + + +def test_unknown_root_export_raises_attribute_error() -> None: + + with pytest.raises(AttributeError): + + _ = agora_agent.NotARealExportName + + + + + +def test_dir_includes_agentkit_vendor_exports() -> None: + + assert "DeepgramSTT" in dir(agora_agent) + + + + + +def test_all_includes_agentkit_vendor_exports() -> None: + + assert "DeepgramSTT" in agora_agent.__all__ + + assert "OpenAI" in agora_agent.__all__ + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from 
..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection 
import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from 
..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from 
..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from 
..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + 
EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = 
GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = 
StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = 
vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config 
+ return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. + """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. 
def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent":
    """Return a copy of this agent with ``audio_scenario`` set in its parameters.

    When the copy has no parameters yet, a new
    ``StartAgentsRequestPropertiesParameters`` holding only the scenario is
    created. Dict-shaped parameters are merged into a new dict; any other
    value is copied through ``_copy_model_update``.
    """
    clone = self._clone()
    existing = clone._parameters

    if existing is None:
        clone._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario)
        return clone

    if isinstance(existing, dict):
        # New dict: the mapping shared with the receiver must stay as it was.
        clone._parameters = typing.cast(
            SessionParamsInput,
            {**existing, "audio_scenario": audio_scenario},
        )
        return clone

    clone._parameters = self._copy_model_update(existing, {"audio_scenario": audio_scenario})
    return clone
@staticmethod
def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any:
    """Return a copy of *value* with the fields in *update* replaced.

    ``model_copy(update=...)`` is used when *value* has it, otherwise
    ``copy(update=...)``.

    Raises
    ------
    TypeError
        If *value* exposes neither method.
    """
    # Order matters: ``model_copy`` wins when both attributes exist.
    for method_name in ("model_copy", "copy"):
        if hasattr(value, method_name):
            return getattr(value, method_name)(update=update)
    raise TypeError(f"Object of type {type(value).__name__} does not support model copying")
@property
def tts_sample_rate(self) -> typing.Optional[int]:
    """The value stored in ``_tts_sample_rate`` (``None`` when unset)."""
    sample_rate = self._tts_sample_rate
    return sample_rate
def create_session(
    self,
    client: typing.Any,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    name: typing.Optional[str] = None,
    token: typing.Optional[str] = None,
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None,
    pipeline_id: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    debug: typing.Optional[bool] = None,
    warn: typing.Optional[typing.Callable[[str], None]] = None,
) -> "AgentSession":
    """Build a synchronous ``AgentSession`` bound to this agent.

    The session name is the first truthy value among *name*, the agent's
    own ``_name`` and ``agent-<unix seconds>``. ``app_id`` and
    ``app_certificate`` are read from *client* when it exposes them
    (``""`` and ``None`` otherwise). All remaining arguments are forwarded
    to the session constructor unchanged.
    """
    # Imported here rather than at module level, as in the original code.
    from .agent_session import AgentSession

    resolved_name = name or self._name or f"agent-{int(time.time())}"
    session_kwargs: typing.Dict[str, typing.Any] = {
        "client": client,
        "agent": self,
        "app_id": getattr(client, "app_id", ""),
        "app_certificate": getattr(client, "app_certificate", None),
        "name": resolved_name,
        "channel": channel,
        "token": token,
        "agent_uid": agent_uid,
        "remote_uids": remote_uids,
        "idle_timeout": idle_timeout,
        "enable_string_uid": enable_string_uid,
        "preset": preset,
        "pipeline_id": pipeline_id,
        "expires_in": expires_in,
        "debug": debug,
        "warn": warn,
    }
    return AgentSession(**session_kwargs)
def to_properties(
    self,
    channel: str,
    agent_uid: str,
    remote_uids: typing.List[str],
    idle_timeout: typing.Optional[int] = None,
    enable_string_uid: typing.Optional[bool] = None,
    token: typing.Optional[str] = None,
    app_id: typing.Optional[str] = None,
    app_certificate: typing.Optional[str] = None,
    expires_in: typing.Optional[int] = None,
    skip_vendor_validation: bool = False,
) -> StartAgentsRequestProperties:
    """Assemble the ``StartAgentsRequestProperties`` for this agent.

    When *token* is omitted, one is generated with
    ``generate_convo_ai_token`` from *app_id* / *app_certificate*, using
    *expires_in* (after ``_validate_expires_in``) as ``token_expire``.

    In MLLM mode (any ``_mllm`` config present) the result carries the MLLM
    config, with the agent-level greeting and failure message filled in only
    where the config does not already define them; LLM/TTS/ASR are not
    attached. Otherwise LLM and TTS are required unless
    *skip_vendor_validation* is true, in which case the properties are
    returned without ``llm`` / ``tts`` / ``asr``.

    Raises
    ------
    ValueError
        If an MLLM config is combined with an avatar that is not disabled,
        if neither *token* nor both *app_id* and *app_certificate* are
        given, or if TTS or LLM is missing in the validated cascading path.
    """
    # A non-None MLLM config selects MLLM mode regardless of its "enable"
    # value, so the mode test reduces to a None check.
    uses_mllm = self._mllm is not None
    avatar_active = isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
    # Fail on the MLLM + avatar combination before doing any token work so
    # the caller sees this error first.
    if uses_mllm and avatar_active:
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    if token is None:
        if app_id is None or app_certificate is None:
            raise ValueError("Either token or app_id+app_certificate must be provided")
        token_kwargs: typing.Dict[str, typing.Any] = {}
        if expires_in is not None:
            checked_expiry = _validate_expires_in(expires_in)
            if checked_expiry is not None:
                token_kwargs["token_expire"] = checked_expiry
        # generate_convo_ai_token covers RTC + RTM, so the token is usable
        # whether or not advanced_features.enable_rtm is switched on.
        token = generate_convo_ai_token(
            app_id=app_id,
            app_certificate=app_certificate,
            channel_name=channel,
            uid=_parse_numeric_uid(agent_uid, "agent_uid"),
            **token_kwargs,
        )

    props: typing.Dict[str, typing.Any] = {
        "channel": channel,
        "token": token,
        "agent_rtc_uid": agent_uid,
        "remote_rtc_uids": remote_uids,
    }

    def put_if_set(pairs: typing.Iterable[typing.Tuple[str, typing.Any]]) -> None:
        # Copy only the values that were actually configured.
        for key, value in pairs:
            if value is not None:
                props[key] = value

    put_if_set(
        (
            ("idle_timeout", idle_timeout),
            ("enable_string_uid", enable_string_uid),
            ("mllm", self._mllm),
            ("turn_detection", self._turn_detection),
            ("interruption", self._interruption),
            ("sal", self._sal),
            ("avatar", self._avatar),
            ("advanced_features", self._advanced_features),
        )
    )

    resolved = self._resolved_parameters()
    if isinstance(resolved, dict):
        props["parameters"] = StartAgentsRequestPropertiesParameters(**resolved)
    elif resolved is not None:
        props["parameters"] = resolved

    put_if_set(
        (
            ("geofence", self._geofence),
            ("labels", self._labels),
            ("rtc", self._rtc),
            ("filler_words", self._filler_words),
        )
    )

    if uses_mllm:
        # Work on a copy; setdefault keeps values the MLLM config already has.
        mllm_props = dict(self._mllm)
        if self._greeting is not None:
            mllm_props.setdefault("greeting_message", self._greeting)
        if self._failure_message is not None:
            mllm_props.setdefault("failure_message", self._failure_message)
        props["mllm"] = mllm_props
        return StartAgentsRequestProperties(**props)

    if skip_vendor_validation:
        return StartAgentsRequestProperties(**props)

    if self._tts is None:
        raise ValueError("TTS configuration is required. Use with_tts() to set it.")

    if self._llm is None:
        raise ValueError("LLM configuration is required. Use with_llm() to set it.")

    # Agent-level fields overwrite whatever the LLM vendor config carries.
    llm_props = dict(self._llm)
    if self._instructions is not None:
        llm_props["system_messages"] = [{"role": "system", "content": self._instructions}]
    if self._greeting is not None:
        llm_props["greeting_message"] = self._greeting
    if self._greeting_configs is not None:
        llm_props["greeting_configs"] = _dump_optional_model(self._greeting_configs)
    if self._failure_message is not None:
        llm_props["failure_message"] = self._failure_message
    if self._max_history is not None:
        llm_props["max_history"] = self._max_history

    props["llm"] = llm_props
    props["tts"] = self._tts
    if self._stt is not None:
        props["asr"] = self._stt

    return StartAgentsRequestProperties(**props)
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # Either a single preset or a sequence of presets.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callable that receives a warning message string.
    warn: typing.Callable[[str], None]
@property
def raw(self) -> typing.Any:
    """The ``agents`` attribute of the client this session was created with.

    This is the underlying Fern-generated AgentsClient; use it to reach
    endpoints that have no agentkit wrapper yet.
    """
    client = self._client
    return client.agents
def _validate_avatar_config(self) -> None:
    """Check the agent's avatar configuration against its TTS settings.

    Does nothing when the agent has no avatar or the avatar's ``enable``
    is ``False``. Otherwise:

    * raises ``ValueError`` when the session is in MLLM mode;
    * runs ``validate_avatar_config`` for the recognised avatar vendors;
    * validates the TTS sample rate when an integer rate can be found
      (the agent's ``tts_sample_rate``, else ``sample_rate`` /
      ``sample_rate_hertz`` / ``samplingRate`` in the TTS params);
    * otherwise emits a warning through ``self._warn`` for HeyGen,
      LiveAvatar and Akool avatars.
    """
    avatar = self._agent.avatar
    tts = self._agent.tts
    if not avatar or avatar.get("enable", True) is False:
        return
    if self._is_mllm_mode():
        raise ValueError(
            "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
            "Remove the avatar configuration when using MLLM, or switch to a cascading session."
        )

    vendor_checks = (
        is_heygen_avatar,
        is_live_avatar_avatar,
        is_akool_avatar,
        is_anam_avatar,
        is_generic_avatar,
    )
    if any(check(avatar) for check in vendor_checks):
        validate_avatar_config(avatar)

    tts_params = tts.get("params") if isinstance(tts, dict) else None
    sample_rate = self._agent.tts_sample_rate
    if sample_rate is None and isinstance(tts_params, dict):
        # First truthy key wins; if none is truthy the last lookup is kept.
        for key in ("sample_rate", "sample_rate_hertz", "samplingRate"):
            sample_rate = tts_params.get(key)
            if sample_rate:
                break

    if isinstance(sample_rate, int):
        validate_tts_sample_rate(avatar, sample_rate)
        return

    # No usable sample rate: warn for the vendors listed below, first match only.
    missing_rate_warnings = (
        (
            is_heygen_avatar,
            "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. "
            "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz.",
        ),
        (
            is_live_avatar_avatar,
            "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. "
            "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz.",
        ),
        (
            is_akool_avatar,
            "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. "
            "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz.",
        ),
    )
    for matches_vendor, message in missing_rate_warnings:
        if matches_vendor(avatar):
            self._warn(message)
            break
@staticmethod
def _dump_model(value: typing.Any) -> typing.Any:
    """Convert *value* into plain data, dropping ``None`` entries.

    * Objects with ``model_dump`` are dumped with ``exclude_none=True``.
    * Dicts are rebuilt recursively without their ``None`` values.
    * Lists are rebuilt recursively (``None`` items are kept).
    * Objects that look like pydantic-v1 models (``__fields__`` plus a
      callable ``dict``) are dumped with ``dict(exclude_none=True)``.
    * Anything else is returned unchanged.

    The v1 fallback keeps this helper in line with ``_copy_model_update``,
    which accepts models that only offer ``copy``. Without it such a model
    would be returned undumped, and callers that treat the result as a dict
    would fail.
    """

    def dump(item: typing.Any) -> typing.Any:
        if hasattr(item, "model_dump"):
            return item.model_dump(exclude_none=True)
        if isinstance(item, dict):
            return {key: dump(entry) for key, entry in item.items() if entry is not None}
        if isinstance(item, list):
            return [dump(entry) for entry in item]
        legacy_dump = getattr(item, "dict", None)
        # Require __fields__ too so unrelated objects with a dict() method
        # keep passing through untouched.
        if callable(legacy_dump) and hasattr(item, "__fields__"):
            return legacy_dump(exclude_none=True)
        return item

    return dump(value)
@staticmethod
def _response_turns(response: typing.Any) -> typing.List[typing.Any]:
    """Return the ``turns`` of *response* as a new list.

    *response* may be a dict (key lookup) or any other object (attribute
    lookup). A missing or falsy ``turns`` yields an empty list.
    """
    if isinstance(response, dict):
        found = response.get("turns")
    else:
        found = getattr(response, "turns", None)
    if not found:
        return []
    return list(found)
def _emit(self, event: str, data: typing.Any) -> None:
    """Call every handler registered for *event* with *data*.

    Handlers run in registration order. An exception raised by a handler
    is reported with ``warnings.warn`` and does not stop the remaining
    handlers or propagate to the caller.

    Dispatch walks a snapshot of the handler list. A handler may therefore
    call ``on()`` / ``off()`` for the same event while it runs (for example
    a one-shot handler removing itself) without causing another handler to
    be skipped; such changes take effect from the next emit.
    """
    handlers = self._event_handlers.get(event)
    if not handlers:
        return
    # Iterating the live list while a handler removes itself shifts the
    # remaining items and silently skips the next handler.
    for handler in tuple(handlers):
        try:
            handler(data)
        except Exception as exc:
            # A misbehaving handler must not block the others or the session
            # lifecycle; warn so the failure is not silently lost.
            warnings.warn(
                f"Event handler for '{event}' raised an exception: {exc}",
                stacklevel=2,
            )
def start(self) -> str:
    """Start the agent session.

    Returns
    -------
    str
        The agent ID reported by the server, or ``""`` when the response
        carries none.

    Raises
    ------
    RuntimeError
        If the session is not in the ``idle``, ``stopped`` or ``error`` state.
    ValueError
        If avatar/TTS configuration is invalid.
    """
    startable_states = ("idle", "stopped", "error")
    if self._status not in startable_states:
        raise RuntimeError(f"Cannot start session in {self._status} state")

    # Runs before the status changes, so a validation failure leaves the
    # session state untouched and emits no "error" event.
    self._validate_avatar_config()
    self._status = "starting"

    try:
        # An explicit token wins; otherwise hand over the credentials needed
        # to generate one.
        token_opts: typing.Dict[str, typing.Any] = (
            {"token": self._token}
            if self._token
            else {
                "app_id": self._app_id,
                "app_certificate": self._app_certificate,
                "expires_in": self._expires_in,
            }
        )

        start_properties = self._build_start_properties(token_opts)
        resolved_preset, resolved_properties = resolve_session_presets(
            self._preset,
            start_properties,
        )

        if self._debug:
            # NOTE(review): this prints the complete request properties,
            # which presumably include the session token and vendor
            # credentials - confirm debug output is never enabled where
            # logs are shared.
            debug_request = {
                "appid": self._app_id,
                "name": self._name,
                "preset": resolved_preset,
                "pipeline_id": self._pipeline_id,
                "properties": resolved_properties,
            }
            print("[Agora Debug] Starting agent session...")
            print("[Agora Debug] Request:", debug_request)

        # Prefer the typed request model; if it cannot be built from the
        # resolved properties, send the plain dict instead.
        request_properties: typing.Any
        try:
            request_properties = StartAgentsRequestProperties(**resolved_properties)
        except Exception:
            request_properties = resolved_properties

        response = self._client.agents.start(
            self._app_id,
            name=self._name,
            properties=request_properties,
            preset=resolved_preset,
            pipeline_id=self._pipeline_id,
            request_options=self._request_options(),
        )
    except Exception as failure:
        self._status = "error"
        self._emit("error", failure)
        raise

    self._agent_id = getattr(response, "agent_id", None)
    self._status = "running"
    self._emit("started", {"agent_id": self._agent_id})
    return self._agent_id or ""
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_root_exports.py: | + import pytest + + import agora_agent + import agora_agent.agentkit as agentkit + + + def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + + def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + + def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + + def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ + status: unresolved + - id: patch-d475306b + content_hash: sha256:407af5e7564d6e8d0b91f1e117cb433aec931f083225af53c6df2abfff281b22 + original_commit: d475306bd42279984bcf4934b900003e8e02c4eb + original_message: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - docs/getting-started/installation.md + patch_content: | + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index e43d1d8..1da36aa 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index 04b48da..8fca9ab 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + -See [Authentication](./authentication.md) for setup details. + +## Next steps + + + +- [Authentication](./authentication.md) — configure your credentials + +- [Quick Start](./quick-start.md) — build your first conversational agent + + + +## Migrating from a previous package name + + + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). 
+ theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. + --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Imports + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI + ``` + + The package installs as `agora-agents` and imports as `agora_agent`. + + ## Sync vs. 
Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + ## Dependencies + + | Package | Purpose | + | ------------------------------ | ------------------------------------------------------ | + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Next steps + + - [Authentication](./authentication.md) — configure your credentials + - [Quick Start](./quick-start.md) — build your first conversational agent + + ## Migrating from a previous package name + + The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). + status: unresolved + - id: patch-c9355576 + content_hash: sha256:83b3b6148b21f2b4d53ee67321777522f5f4e871b61ea3b23f3a6b88ca052769 + original_commit: c93555763ffd63267a737b3e430217a890f203db + original_message: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/authentication.md + - docs/guides/low-level-api.md + patch_content: | + diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md + index 31dcc56..74c62cd 100644 + --- a/docs/getting-started/authentication.md + +++ b/docs/getting-started/authentication.md + @@ -46,41 +46,6 @@ session = agent.create_session( + print(client.auth_mode) # "app-credentials" + ``` + + -## Other auth modes + +## Legacy auth modes + + -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. + - + -### Token auth (`auth_token`) + - + -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - auth_token="your-rest-auth-token", + -) + - + -session = agent.create_session( + - client, + - channel="room-123", + - agent_uid="1", + - remote_uids=["100"], + - token="your-rtc-join-token", + -) + -``` + - + -### Basic Auth (`customer_id` + `customer_secret`) + - + -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - customer_id="your-customer-id", + - customer_secret="your-customer-secret", + -) + -``` + +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. 
+ diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md + index 6677b45..47397b7 100644 + --- a/docs/guides/low-level-api.md + +++ b/docs/guides/low-level-api.md + @@ -1,187 +1,55 @@ + --- + sidebar_position: 10 + title: Low-Level API + -description: Direct client.agents.start() usage without the builder pattern. + +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. + +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + -## Raw telephony and phone-number APIs + - + -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + - + -- `client.telephony` for call status and hangup operations + -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
+ + -## Cascading flow (ASR → LLM → TTS) + +## Client setup + + ```python + from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import Tts_Elevenlabs + + client = Agora( + area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + -client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + + app_id="your-app-id", + + app_certificate="your-app-certificate", + ) + ``` + + -## Async (low-level) + +## Raw telephony and phone-number APIs + + -```python + -import asyncio + -from agora_agent import Area, AsyncAgora + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import 
Tts_Elevenlabs + +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + -client = AsyncAgora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + +- `client.telephony` for call status and hangup operations + +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + + +```python + +calls = client.telephony.list( + + appid=client.app_id, + + type="sip", + ) + + -async def main() -> None: + - await client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + - ) + - + -asyncio.run(main()) + +for call in calls: + + print(call.id, call.state) + ``` + + -## MLLM flow (multimodal) + +## Direct agent APIs + + -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). + +`client.agents` exposes the generated REST surface for advanced integrations. 
Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + -```python + -from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesMllm, + - StartAgentsRequestPropertiesMllmVendor, + - StartAgentsRequestPropertiesTts, + - StartAgentsRequestPropertiesTtsVendor, + - StartAgentsRequestPropertiesLlm, + -) + +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + -client = Agora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + - + -client.agents.start( + - client.app_id, + - name="mllm_agent", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="your_token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - mllm=StartAgentsRequestPropertiesMllm( + - enable=True, + - url="wss://api.openai.com/v1/realtime", + - api_key="", + - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + - params={ + - "model": "gpt-4o-realtime-preview", + - "voice": "alloy", + - }, + - input_modalities=["audio"], + - output_modalities=["text", "audio"], + - greeting_message="Hello! I'm ready to chat in real-time.", + - turn_detection={ + - "mode": "server_vad", + - "server_vad_config": { + - "idle_timeout_ms": 5000, + - }, + - }, + - ), + - ), + +```python + +info = session.raw.get( + + appid=session.app_id, + + agent_id=session.id, + ) + ``` + + -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). + +You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ theirs_snapshot: + docs/getting-started/authentication.md: | + --- + sidebar_position: 2 + title: Authentication + description: Configure the Python SDK with app credentials and understand other supported auth modes. + --- + + # Authentication + + Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. + + ## App credentials + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + + agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) + ) + + session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + ) + ``` + + ## Why app credentials + + - Fresh short-lived tokens per API call instead of reusing long-lived credentials + - No Customer ID / Customer Secret in request headers + - No manual REST or RTC token provisioning in application code + + ## Inspecting auth mode + + ```python + print(client.auth_mode) # "app-credentials" + ``` + + ## Legacy auth modes + + The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. + docs/guides/low-level-api.md: | + --- + sidebar_position: 10 + title: Low-Level API + description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + Use the `Agent` builder and `AgentSession` for conversational agent starts. 
That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. + + ## Client setup + + ```python + from agora_agent import Agora, Area + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + ``` + + ## Raw telephony and phone-number APIs + + AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + - `client.telephony` for call status and hangup operations + - `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + ```python + calls = client.telephony.list( + appid=client.app_id, + type="sip", + ) + + for call in calls: + print(call.id, call.state) + ``` + + ## Direct agent APIs + + `client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + ```python + info = session.raw.get( + appid=session.app_id, + agent_id=session.id, + ) + ``` + + You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ status: unresolved + - id: patch-299e4bd9 + content_hash: sha256:e1470176436d28416d0ff67d8acc614060fae7b312f86c09b899a92d1c4adfe4 + original_commit: 299e4bd9cb59bd6144084332a7c3fa7bf260769f + original_message: "fix(agentkit): resolve provider config type checks" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/vendors/llm.py + - src/agora_agent/agentkit/vendors/mllm.py + - src/agora_agent/agentkit/vendors/stt.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 6275f04..ecf01c6 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -57,6 +57,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + @@ -536,6 +538,23 @@ class Agent: + ) + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + 
+ new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 9156a01..5dd822d 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,7 +1,10 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field, model_validator + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + LlmGreetingConfigs = Dict[str, Any] + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 236a494..6a260d8 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py + index e5117b0..bb222a9 100644 + --- a/src/agora_agent/agentkit/vendors/stt.py + +++ b/src/agora_agent/agentkit/vendors/stt.py + @@ -89,6 +89,7 @@ class SpeechmaticsSTTOptions(BaseModel): + + api_key: str = Field(..., description="Speechmatics API key") + language: str = Field(..., description="Language code (e.g., en, es, fr)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora 
interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + @@ -124,6 +125,7 @@ class DeepgramSTTOptions(BaseModel): + api_key: Optional[str] = Field(default=None, description="Deepgram API key") + model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") + punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + @@ -353,6 +355,7 @@ class SarvamSTTOptions(BaseModel): + + api_key: str = Field(..., description="Sarvam API key") + language: str = Field(..., description="Language code (e.g., en, hi, ta)") + + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + import warnings + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import 
StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from 
..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from 
..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule 
+ from ..types.tts import Tts + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + from ..types.mllm import Mllm + from ..types.mllm_turn_detection import MllmTurnDetection + from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode + from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = Llm + LlmStyle = GeneratedLlmStyle + SttConfig = Asr + AsrConfig = SttConfig + SttVendor = typing.Any + TtsConfig = Tts + MllmConfig = Mllm + MllmVendor = GeneratedMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = 
StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = MllmTurnDetection + MllmTurnDetectionMode = GeneratedMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: 
AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = typing.Dict[str, typing.Any] + LlmGreetingConfigsMode = typing.Any + McpServersItem = typing.Dict[str, typing.Any] + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + TurnDetectionLanguage = typing_extensions.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", 
+ "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" + TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + def _is_turn_detection_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _TURN_DETECTION_LANGUAGES + + + def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: + if not _is_turn_detection_language(value): + raise ValueError(f"Invalid interaction language: {value}") + return value # type: ignore[return-value] + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Deprecated: + The Agent-level ``instructions``, ``greeting``, ``failure_message``, + ``max_history``, and ``greeting_configs`` convenience fields are kept + for compatibility. Configure those values on the LLM or MLLM vendor + instead. 
+ + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + 
self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + """Deprecated. Configure system messages on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + """Deprecated. Configure the greeting on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Deprecated. Configure greeting playback on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = 
StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Deprecated. Configure max history on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def pipeline_id(self) -> typing.Optional[str]: + """Published AI Studio pipeline ID used as this agent's base configuration.""" + return self._pipeline_id + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, 
    @property
    def turn_detection(self) -> typing.Optional[TurnDetectionConfig]:
        """The turn detection configuration exactly as stored on this agent (may be ``None``).

        This is the raw stored value; it is not passed through
        ``_resolve_turn_detection_config()``, so no language is filled in here.
        """
        return self._turn_detection
self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], 
    def to_properties(
        self,
        channel: str,
        agent_uid: str,
        remote_uids: typing.List[str],
        idle_timeout: typing.Optional[int] = None,
        enable_string_uid: typing.Optional[bool] = None,
        token: typing.Optional[str] = None,
        app_id: typing.Optional[str] = None,
        app_certificate: typing.Optional[str] = None,
        expires_in: typing.Optional[int] = None,
        skip_vendor_validation: bool = False,
        skip_vendor_validation_categories: typing.Optional[typing.AbstractSet[str]] = None,
        allow_missing_vendor_categories: typing.Optional[typing.AbstractSet[str]] = None,
    ) -> StartAgentsRequestProperties:
        """Build the ``StartAgentsRequestProperties`` payload for this agent.

        Args:
            channel: Channel name; also used as ``channel_name`` when a token is generated.
            agent_uid: Sent as ``agent_rtc_uid``; parsed with ``_parse_numeric_uid``
                only when a token has to be generated.
            remote_uids: Sent as ``remote_rtc_uids``.
            idle_timeout: Included only when not ``None``.
            enable_string_uid: Included only when not ``None``.
            token: Pre-built token. When ``None`` one is generated with
                ``generate_convo_ai_token`` from ``app_id`` and ``app_certificate``.
            app_id: Required when ``token`` is ``None``.
            app_certificate: Required when ``token`` is ``None``.
            expires_in: Optional token lifetime; validated with
                ``_validate_expires_in`` and passed as ``token_expire``.
            skip_vendor_validation: Deprecated. Emits a ``DeprecationWarning`` and
                behaves as if ``asr``, ``llm`` and ``tts`` were all skipped and
                allowed to be missing.
            skip_vendor_validation_categories: Categories (``"asr"``, ``"llm"``,
                ``"tts"``) whose config is neither required nor emitted.
            allow_missing_vendor_categories: Categories that may be absent without
                raising.

        Returns:
            The assembled request properties. When an MLLM config is set the
            method returns early and no ``asr``/``llm``/``tts`` keys are added.

        Raises:
            ValueError: If an MLLM config is combined with an avatar that is not
                explicitly disabled; if neither ``token`` nor
                ``app_id`` + ``app_certificate`` is supplied; or if a required
                TTS/LLM configuration is missing.
        """
        # Validate the MLLM + enabled-avatar combination BEFORE generating the
        # RTC token so callers get a clear, actionable error first (matches the
        # TypeScript and Go SDKs' fail-fast contract).
        mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True
        # NOTE(review): mllm_flag can only be True when self._mllm is not None, so
        # this is equivalent to `self._mllm is not None`; an MLLM config with
        # enable=False still selects MLLM mode. Confirm that is intended.
        is_mllm_mode = bool(mllm_flag or self._mllm is not None)
        # An avatar dict counts as enabled unless "enable" is explicitly False.
        avatar_enabled = (
            isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
        )
        if is_mllm_mode and avatar_enabled:
            raise ValueError(
                "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
                "Remove the avatar configuration when using MLLM, or switch to a cascading session."
            )

        if token is None:
            if app_id is None or app_certificate is None:
                raise ValueError("Either token or app_id+app_certificate must be provided")
            validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None
            # Use generate_convo_ai_token (RTC + RTM) so the token works whether or
            # not the caller enables advanced_features.enable_rtm.
            token_kwargs: typing.Dict[str, typing.Any] = {}
            if validated_expires_in is not None:
                token_kwargs["token_expire"] = validated_expires_in
            token = generate_convo_ai_token(
                app_id=app_id,
                app_certificate=app_certificate,
                channel_name=channel,
                uid=_parse_numeric_uid(agent_uid, "agent_uid"),
                **token_kwargs,
            )

        # Fields that are always present in the request.
        base_kwargs: typing.Dict[str, typing.Any] = {
            "channel": channel,
            "token": token,
            "agent_rtc_uid": agent_uid,
            "remote_rtc_uids": remote_uids,
        }

        # Optional fields are only emitted when they were configured.
        if idle_timeout is not None:
            base_kwargs["idle_timeout"] = idle_timeout
        if enable_string_uid is not None:
            base_kwargs["enable_string_uid"] = enable_string_uid
        if self._mllm is not None:
            base_kwargs["mllm"] = self._mllm
        if self._turn_detection is not None:
            base_kwargs["turn_detection"] = self._turn_detection
        if self._interruption is not None:
            base_kwargs["interruption"] = self._interruption
        if self._sal is not None:
            base_kwargs["sal"] = self._sal
        if self._avatar is not None:
            base_kwargs["avatar"] = self._avatar
        if self._advanced_features is not None:
            base_kwargs["advanced_features"] = self._advanced_features
        # _resolved_parameters() may differ from self._parameters (see that helper).
        parameters = self._resolved_parameters()
        if parameters is not None:
            if isinstance(parameters, dict):
                base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
            else:
                base_kwargs["parameters"] = parameters
        if self._geofence is not None:
            base_kwargs["geofence"] = self._geofence
        if self._labels is not None:
            base_kwargs["labels"] = self._labels
        if self._rtc is not None:
            base_kwargs["rtc"] = self._rtc
        if self._filler_words is not None:
            base_kwargs["filler_words"] = self._filler_words

        # MLLM mode: return before any ASR/LLM/TTS handling below.
        if is_mllm_mode:
            if self._mllm is not None:
                mllm_config = dict(self._mllm)
                # setdefault: values already present on the MLLM config win over
                # the agent-level greeting / failure message.
                if self._greeting is not None:
                    mllm_config.setdefault("greeting_message", self._greeting)
                if self._failure_message is not None:
                    mllm_config.setdefault("failure_message", self._failure_message)
                base_kwargs["mllm"] = mllm_config
            return StartAgentsRequestProperties(**base_kwargs)

        if skip_vendor_validation:
            warnings.warn(
                "skip_vendor_validation is deprecated and will be removed in a future release. "
                "Use skip_vendor_validation_categories and allow_missing_vendor_categories instead.",
                DeprecationWarning,
                stacklevel=2,
            )

        # Copy into sets so the caller's collections are never mutated.
        skip_categories = set(skip_vendor_validation_categories or ())
        allow_missing_categories = set(allow_missing_vendor_categories or ())
        if skip_vendor_validation:
            skip_categories.update({"asr", "llm", "tts"})
            allow_missing_categories.update({"asr", "llm", "tts"})

        skip_asr_validation = skip_vendor_validation or "asr" in skip_categories
        skip_llm_validation = skip_vendor_validation or "llm" in skip_categories
        skip_tts_validation = skip_vendor_validation or "tts" in skip_categories
        allow_missing_asr = "asr" in allow_missing_categories
        allow_missing_llm = "llm" in allow_missing_categories
        allow_missing_tts = "tts" in allow_missing_categories

        # ASR is emitted when not skipped and either an STT config exists or a
        # missing one is not tolerated (then _resolve_asr_config supplies it).
        if not skip_asr_validation and (self._stt is not None or not allow_missing_asr):
            base_kwargs["asr"] = self._resolve_asr_config()
        # Replaces the raw turn_detection set earlier with the language-resolved one.
        base_kwargs["turn_detection"] = self._resolve_turn_detection_config()

        if skip_vendor_validation:
            return StartAgentsRequestProperties(**base_kwargs)

        if self._tts is None and not (skip_tts_validation or allow_missing_tts):
            raise ValueError("TTS configuration is required. Use with_tts() to set it.")

        if self._llm is None and not (skip_llm_validation or allow_missing_llm):
            raise ValueError("LLM configuration is required. Use with_llm() to set it.")

        if self._llm is not None and not skip_llm_validation:
            base_kwargs["llm"] = self._resolve_llm_config()
        if self._tts is not None and not skip_tts_validation:
            base_kwargs["tts"] = self._tts

        return StartAgentsRequestProperties(**base_kwargs)
+ if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + return llm_config + + def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + asr_config = dict(self._stt or {}) + asr_config.pop("language", None) + if not asr_config: + asr_config["vendor"] = "ares" + return asr_config + + def _resolve_turn_detection_config(self) -> TurnDetectionConfig: + existing_stt_language = self._stt.get("language") if self._stt is not None else None + existing_turn_detection_language = self._field_value(self._turn_detection, "language") + language = ( + existing_turn_detection_language + if existing_turn_detection_language is not None + else existing_stt_language + if _is_turn_detection_language(existing_stt_language) + else DEFAULT_TURN_DETECTION_LANGUAGE + ) + language = _validate_turn_detection_language(language) + if self._turn_detection is None: + return StartAgentsRequestPropertiesTurnDetection(language=language) + if isinstance(self._turn_detection, dict): + return typing.cast(TurnDetectionConfig, {**self._turn_detection, "language": language}) + return self._copy_model_update(self._turn_detection, {"language": language}) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._pipeline_id = self._pipeline_id + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = 
def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return copies of the MCP server entries with ``transport`` always set.

    An entry whose ``transport`` is missing or ``None`` gets
    ``"streamable_http"``; other entries are copied as they are. The input
    dictionaries are shallow-copied and never mutated.
    """
    return [
        dict(server, transport="streamable_http") if server.get("transport") is None else dict(server)
        for server in servers
    ]
and self.base_url is not None: + raise ValueError("OpenAI base_url is only valid when api_key is set") + if self.api_key is None and self.model.strip().lower() not in _OPENAI_MANAGED_MODELS: + raise ValueError("OpenAI requires api_key unless using a supported Agora-managed model") + if self.api_key is None and self.vendor is not None: + raise ValueError("OpenAI Agora-managed mode does not allow vendor") + return self + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + 
class AzureOpenAI(BaseLLM):
    """Azure OpenAI vendor wrapper producing an ``openai``-style LLM config.

    Keyword arguments are validated by ``AzureOpenAIOptions``.
    """

    def __init__(self, **kwargs: Any):
        self.options = AzureOpenAIOptions(**kwargs)

    def to_config(self) -> Dict[str, Any]:
        """Serialize the options into the LLM config dictionary.

        The request URL is built from ``endpoint``, ``deployment_name`` and
        ``api_version``. ``vendor`` defaults to ``"azure"``. Optional fields
        are only included when they were set.
        """
        # Strip trailing slashes so an endpoint such as
        # "https://example.openai.azure.com/" does not yield "//openai/...".
        endpoint = self.options.endpoint.rstrip("/")
        url = (
            f"{endpoint}/openai/deployments/"
            f"{self.options.deployment_name}/chat/completions"
            f"?api-version={self.options.api_version}"
        )
        config: Dict[str, Any] = {
            "url": url,
            "api_key": self.options.api_key,
            "vendor": self.options.vendor or "azure",
            "style": "openai",
            "input_modalities": self.options.input_modalities or ["text"],
        }

        # Named fields take precedence over anything in the generic params dict.
        params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})}
        if self.options.temperature is not None:
            params["temperature"] = self.options.temperature
        if self.options.top_p is not None:
            params["top_p"] = self.options.top_p
        if self.options.max_tokens is not None:
            params["max_tokens"] = self.options.max_tokens
        # params always contains "model", so it is never empty.
        config["params"] = params
        if self.options.headers is not None:
            config["headers"] = self.options.headers

        if self.options.system_messages is not None:
            config["system_messages"] = self.options.system_messages
        if self.options.greeting_message is not None:
            config["greeting_message"] = self.options.greeting_message
        if self.options.failure_message is not None:
            config["failure_message"] = self.options.failure_message
        if self.options.output_modalities is not None:
            config["output_modalities"] = self.options.output_modalities
        if self.options.greeting_configs is not None:
            config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs)
        if self.options.template_variables is not None:
            config["template_variables"] = self.options.template_variables
        if self.options.mcp_servers is not None:
            config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers)
        if self.options.max_history is not None:
            config["max_history"] = self.options.max_history

        return config
class Anthropic(BaseLLM):
    """Anthropic vendor wrapper producing an ``anthropic``-style LLM config.

    Keyword arguments are validated by ``AnthropicOptions``.
    """

    def __init__(self, **kwargs: Any):
        self.options = AnthropicOptions(**kwargs)

    def to_config(self) -> Dict[str, Any]:
        """Serialize the options into the LLM config dictionary.

        ``params`` starts from ``model`` plus the generic ``params`` dict;
        the named sampling fields then override same-named entries. Optional
        top-level fields are only emitted when set.
        """
        opts = self.options

        # Named fields take precedence over anything in the generic params dict.
        params: Dict[str, Any] = {"model": opts.model}
        params.update(opts.params or {})
        for param_name in ("max_tokens", "temperature", "top_p"):
            param_value = getattr(opts, param_name)
            if param_value is not None:
                params[param_name] = param_value

        config: Dict[str, Any] = {
            "url": opts.url,
            "api_key": opts.api_key,
            "params": params,
            "headers": opts.headers,
            "style": "anthropic",
            "input_modalities": opts.input_modalities or ["text"],
        }

        # Fields that need conversion before being written to the config.
        converters = {
            "greeting_configs": _dump_optional_model,
            "mcp_servers": _ensure_mcp_transport,
        }
        # Tuple order is the order in which keys are added to the config.
        optional_fields = (
            "system_messages",
            "greeting_message",
            "failure_message",
            "output_modalities",
            "greeting_configs",
            "template_variables",
            "vendor",
            "mcp_servers",
            "max_history",
        )
        for field_name in optional_fields:
            field_value = getattr(opts, field_name)
            if field_value is None:
                continue
            convert = converters.get(field_name)
            config[field_name] = field_value if convert is None else convert(field_value)

        return config
class Gemini(BaseLLM):
    """Google Gemini vendor wrapper producing a ``gemini``-style LLM config.

    Keyword arguments are validated by ``GeminiOptions``.
    """

    def __init__(self, **kwargs: Any):
        self.options = GeminiOptions(**kwargs)

    def to_config(self) -> Dict[str, Any]:
        """Serialize the options into the LLM config dictionary.

        ``params`` starts from ``model`` plus the generic ``params`` dict;
        the named generation fields then override same-named entries. The URL
        falls back to the public Generative Language endpoint when ``url`` is
        not given. Optional top-level fields are only emitted when set.
        """
        opts = self.options

        # Named fields take precedence over anything in the generic params dict.
        params: Dict[str, Any] = {"model": opts.model}
        params.update(opts.params or {})
        for param_name in ("temperature", "top_p", "top_k", "max_output_tokens"):
            param_value = getattr(opts, param_name)
            if param_value is not None:
                params[param_name] = param_value

        config: Dict[str, Any] = {
            "url": opts.url or "https://generativelanguage.googleapis.com/v1beta/models",
            "api_key": opts.api_key,
            "params": params,
            "style": "gemini",
            "input_modalities": opts.input_modalities or ["text"],
        }

        # Fields that need conversion before being written to the config.
        converters = {
            "greeting_configs": _dump_optional_model,
            "mcp_servers": _ensure_mcp_transport,
        }
        # Tuple order is the order in which keys are added to the config.
        optional_fields = (
            "system_messages",
            "headers",
            "greeting_message",
            "failure_message",
            "output_modalities",
            "greeting_configs",
            "template_variables",
            "vendor",
            "mcp_servers",
            "max_history",
        )
        for field_name in optional_fields:
            field_value = getattr(opts, field_name)
            if field_value is None:
                continue
            convert = converters.get(field_name)
            config[field_name] = field_value if convert is None else convert(field_value)

        return config
Field(..., description="Groq-compatible endpoint") + + + class Groq(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = GroqOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["url"] = self.options.base_url + return config + + + class CustomLLMOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Custom LLM API key") + base_url: str = Field(..., description="OpenAI-compatible chat completions endpoint") + + + class CustomLLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = CustomLLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["vendor"] = self.options.vendor or "custom" + return config + + + class VertexAILLMOptions(GeminiOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Vertex AI access token or API key") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location") + + + class VertexAILLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAILLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + options = _dump_optional_model(self.options) + options.pop("project_id", None) + options.pop("location", None) + config = Gemini(**options).to_config() + params = dict(config["params"]) + params["project_id"] = self.options.project_id + params["location"] = self.options.location + config["params"] = params + return config + + + class AmazonBedrockOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + access_key: str = Field(..., description="AWS access key ID") + secret_key: str = Field(..., description="AWS secret access key") + region: str = Field(..., description="AWS region") + model: str = Field(..., description="Amazon Bedrock model identifier") + 
max_tokens: Optional[int] = Field(default=None, gt=0) + url: Optional[str] = Field(default=None, description="Amazon Bedrock converse stream endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + + class AmazonBedrock(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AmazonBedrockOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or f"https://bedrock-runtime.{self.options.region}.amazonaws.com/model/{self.options.model}/converse-stream", + "access_key": self.options.access_key, + "secret_key": self.options.secret_key, + "region": self.options.region, + "model": self.options.model, + "params": params, + "style": "bedrock", + "input_modalities": self.options.input_modalities or ["text"], + 
} + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config + + + class DifyOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Dify API key") + url: str = Field(..., description="Dify workflow or chat endpoint") + model: str = Field(..., description="Dify model identifier") + user: Optional[str] = Field(default=None, description="Dify user identifier") + conversation_id: Optional[str] = Field(default=None, description="Dify conversation ID") + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = 
Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0) + + + class Dify(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = DifyOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.user is not None: + params["user"] = self.options.user + if self.options.conversation_id is not None: + params["conversation_id"] = self.options.conversation_id + + config: Dict[str, Any] = { + "url": self.options.url, + "api_key": self.options.api_key, + "params": params, + "style": "dify", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config + src/agora_agent/agentkit/vendors/mllm.py: | + import 
warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...types.mllm_turn_detection import MllmTurnDetection + from .base import BaseMLLM + + MllmTurnDetectionConfig = MllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + voice: Optional[str] = Field(default=None, description="Voice identifier") + instructions: Optional[str] = Field(default=None, description="System instructions") + input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="Audio transcription settings") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if ( + self.options.model is not None + or self.options.params is not None + or self.options.voice is not None + 
or self.options.instructions is not None + or self.options.input_audio_transcription is not None + ): + params: Dict[str, Any] = {} + if self.options.model is not None: + params["model"] = self.options.model + if self.options.params is not None: + params.update(self.options.params) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.input_audio_transcription is not None: + params["input_audio_transcription"] = self.options.input_audio_transcription + config["params"] = params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. 
+ + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if 
self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, 
description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options + + config: Dict[str, Any] = { + "vendor": "vertexai", + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + 
config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + 
class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + src/agora_agent/agentkit/vendors/stt.py: | + from typing import Any, Dict, Optional, Tuple + + from pydantic import BaseModel, ConfigDict, Field, 
model_validator + from typing_extensions import Literal + + from .base import BaseSTT + + TurnDetectionLanguage = Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + TURN_DETECTION_LANGUAGE_VALUES: Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} + + + def _turn_detection_language(language: Optional[str]) -> Optional[TurnDetectionLanguage]: + if language in _TURN_DETECTION_LANGUAGES: + return language # type: ignore[return-value] + return None + + + class SpeechmaticsSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Speechmatics API key") + language: str = Field(..., description="Language code (e.g., en, es, fr)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class SpeechmaticsSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = SpeechmaticsSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + 
params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "api_key": self.options.api_key, + "language": self.options.language, + }) + if self.options.model is not None: + params["model"] = self.options.model + if self.options.uri is not None: + params["uri"] = self.options.uri + + config: Dict[str, Any] = { + "vendor": "speechmatics", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class DeepgramSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="Deepgram API key") + model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") + punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + @model_validator(mode="after") + def _validate_managed_model(self) -> "DeepgramSTTOptions": + if self.api_key is None and (self.model is None or self.model.strip().lower() not in _DEEPGRAM_MANAGED_MODELS): + raise ValueError("DeepgramSTT requires api_key unless using a supported Agora-managed model") + return self + + class DeepgramSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = DeepgramSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + if self.options.api_key is not None: + params["key"] = self.options.api_key + if self.options.model is not None: + params["model"] = 
self.options.model + if self.options.language is not None: + params["language"] = self.options.language + if self.options.smart_format is not None: + params["smart_format"] = self.options.smart_format + if self.options.punctuation is not None: + params["punctuation"] = self.options.punctuation + config: Dict[str, Any] = { + "vendor": "deepgram", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class MicrosoftSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + key: str = Field(..., description="Azure subscription key") + region: str = Field(..., description="Azure region (e.g., eastus)") + language: str = Field(..., description="Language code (e.g., en-US)") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class MicrosoftSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = MicrosoftSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "key": self.options.key, + "region": self.options.region, + }) + if self.options.language is not None: + params["language"] = self.options.language + + config: Dict[str, Any] = { + "vendor": "microsoft", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class OpenAISTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model (default: whisper-1)") + language: Optional[str] = Field(default=None, description="Language code") + prompt: Optional[str] = Field(default=None, description="Prompt that guides OpenAI transcription") + 
input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="OpenAI transcription settings") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class OpenAISTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = OpenAISTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + + transcription = {"model": "whisper-1", **(self.options.input_audio_transcription or {})} + if self.options.model is not None: + transcription["model"] = self.options.model + if self.options.prompt is not None: + transcription["prompt"] = self.options.prompt + if self.options.language is not None: + transcription["language"] = self.options.language + params["input_audio_transcription"] = transcription + + config: Dict[str, Any] = { + "vendor": "openai", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class GoogleSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud region") + adc_credentials_string: str = Field(..., description="Google service account credentials JSON string") + language: str = Field(..., description="Language code (e.g., en-US)") + model: Optional[str] = Field(default=None, description="Recognition model") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class GoogleSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = GoogleSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "project_id": self.options.project_id, + "location": self.options.location, + 
"adc_credentials_string": self.options.adc_credentials_string, + }) + + if self.options.language is not None: + params["language"] = self.options.language + if self.options.model is not None: + params["model"] = self.options.model + + config: Dict[str, Any] = { + "vendor": "google", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AmazonSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + access_key: str = Field(..., description="AWS Access Key ID") + secret_key: str = Field(..., description="AWS Secret Access Key") + region: str = Field(..., description="AWS region (e.g., us-east-1)") + language: str = Field(..., description="Language code") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class AmazonSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AmazonSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "access_key_id": self.options.access_key, + "secret_access_key": self.options.secret_key, + "region": self.options.region, + }) + if self.options.language is not None: + params["language_code"] = self.options.language + + config: Dict[str, Any] = { + "vendor": "amazon", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AssemblyAISTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="AssemblyAI API key") + language: str = Field(..., description="Language code") + uri: Optional[str] = Field(default=None, description="AssemblyAI streaming WebSocket URL") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + 
+ class AssemblyAISTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AssemblyAISTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + if self.options.language is not None: + params["language"] = self.options.language + if self.options.uri is not None: + params["uri"] = self.options.uri + + config: Dict[str, Any] = { + "vendor": "assemblyai", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + + + class AresSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + language: Optional[TurnDetectionLanguage] = Field(default=None, description="Language code") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class AresSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = AresSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = {"vendor": "ares"} + if self.options.language is not None: + config["language"] = self.options.language + if self.options.additional_params: + config["params"] = self.options.additional_params + return config + + + class SarvamSTTOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Sarvam API key") + language: str = Field(..., description="Language code (e.g., en, hi, ta)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") + model: Optional[str] = Field(default=None, description="Model name") + additional_params: Optional[Dict[str, Any]] = Field(default=None) + + class SarvamSTT(BaseSTT): + def __init__(self, **kwargs: Any): + self.options = SarvamSTTOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] 
= dict(self.options.additional_params or {}) + params.update({ + "api_key": self.options.api_key, + "language": self.options.language, + }) + if self.options.model is not None: + params["model"] = self.options.model + + config: Dict[str, Any] = { + "vendor": "sarvam", + "params": params, + } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config + - id: patch-617ee134 + content_hash: sha256:ea2d27ba8019bf09ce5766d322eb7218fcee0a90124e823ba16c4e45dc1af5a9 + original_commit: 617ee134d9dafbf4f4f83d5e98b80ad110c6e1bf + original_message: "feat(agentkit): support agent-level pipeline_id" + original_author: Hermes (agora) + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_pipeline_id.py + patch_content: | + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 187229f..86d4fbd 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -27,12 +27,14 @@ Agent( + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. 
| + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + @@ -47,6 +49,8 @@ Agent( + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + +`pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + @@ -202,6 +206,8 @@ create_session( + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + + preset: Optional[Union[str, Sequence[str]]] = None, + + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + @@ -219,6 +225,10 @@ Creates an `AgentSession` bound to the given client and channel. + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. 
| + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + diff --git a/docs/reference/session.md b/docs/reference/session.md + index 63402f6..76e1367 100644 + --- a/docs/reference/session.md + +++ b/docs/reference/session.md + @@ -33,6 +33,11 @@ AgentSession( + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + + preset: Optional[Union[str, Sequence[str]]] = None, + + pipeline_id: Optional[str] = None, + + expires_in: Optional[int] = None, + + debug: Optional[bool] = None, + + warn: Optional[Callable[[str], None]] = None, + ) + ``` + + @@ -51,6 +56,13 @@ AgentSession( + | `token` | `Optional[str]` | No | Pre-built RTC token | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + +| `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | + +| `debug` | `Optional[bool]` | No | Enable debug logging of the start request | + +| `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + + + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. 
+ + ## Methods + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index fea1f0d..0a652db 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -343,8 +343,10 @@ class Agent: + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + @@ -609,6 +611,11 @@ class Agent: + def name(self) -> typing.Optional[str]: + return self._name + + + @property + + def pipeline_id(self) -> typing.Optional[str]: + + """Published AI Studio pipeline ID used as this agent's base configuration.""" + + return self._pipeline_id + + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + @@ -693,6 +700,7 @@ class Agent: + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + @@ -945,6 +953,7 @@ class Agent: + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + + new_agent._pipeline_id = self._pipeline_id + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e113dc1..5c866ac 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -52,7 +52,8 @@ class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + + Optional fields + --------------- + - app_certificate, token, idle_timeout, enable_string_uid, expires_in + + app_certificate, 
token, idle_timeout, enable_string_uid, preset, + + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + @@ -290,14 +291,18 @@ class _AgentSessionBase: + return True + return mllm is not None + + - def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + + def _build_start_properties( + + self, + + token_opts: typing.Dict[str, typing.Any], + + skip_vendor_validation: bool, + + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + - skip_vendor_validation=True, + + skip_vendor_validation=skip_vendor_validation, + **token_opts, + ) + properties = self._dump_model(base_properties) + @@ -445,6 +450,7 @@ class AgentSession(_AgentSessionBase): + self._status = "starting" + + try: + + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + @@ -454,7 +460,7 @@ class AgentSession(_AgentSessionBase): + "expires_in": self._expires_in, + } + + - properties = self._build_start_properties(token_opts) + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + @@ -466,7 +472,7 @@ class AgentSession(_AgentSessionBase): + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + - "pipeline_id": self._pipeline_id, + + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + @@ -480,7 +486,7 @@ class AgentSession(_AgentSessionBase): + name=self._name, + properties=request_properties, + preset=resolved_preset, + - pipeline_id=self._pipeline_id, + + pipeline_id=pipeline_id, + 
request_options=self._request_options(), + ) + + @@ -766,6 +772,7 @@ class AsyncAgentSession(_AgentSessionBase): + self._status = "starting" + + try: + + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + @@ -775,7 +782,7 @@ class AsyncAgentSession(_AgentSessionBase): + "expires_in": self._expires_in, + } + + - properties = self._build_start_properties(token_opts) + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + @@ -787,7 +794,7 @@ class AsyncAgentSession(_AgentSessionBase): + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + - "pipeline_id": self._pipeline_id, + + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + @@ -801,7 +808,7 @@ class AsyncAgentSession(_AgentSessionBase): + name=self._name, + properties=request_properties, + preset=resolved_preset, + - pipeline_id=self._pipeline_id, + + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + diff --git a/tests/custom/test_pipeline_id.py b/tests/custom/test_pipeline_id.py + new file mode 100644 + index 0000000..c6c8c8f + --- /dev/null + +++ b/tests/custom/test_pipeline_id.py + @@ -0,0 +1,123 @@ + +import pytest + + + +from agora_agent import Agent + + + + + +def dump(value): + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + + + +class StartResponse: + + agent_id = "agent-id" + + + + + +class FakeAgentsClient: + + def __init__(self): + + self.calls = [] + + + + def start(self, appid, **kwargs): + + self.calls.append({"appid": appid, **kwargs}) + + return StartResponse() + + + + + +class FakeAsyncAgentsClient: + + def 
__init__(self): + + self.calls = [] + + + + async def start(self, appid, **kwargs): + + self.calls.append({"appid": appid, **kwargs}) + + return StartResponse() + + + + + +class FakeClient: + + app_id = "appid" + + app_certificate = None + + + + def __init__(self, agents): + + self.agents = agents + + + + + +def start_agent(agent, **overrides): + + agents = FakeAgentsClient() + + client = FakeClient(agents) + + options = { + + "channel": "channel", + + "token": "token", + + "agent_uid": "1", + + "remote_uids": ["100"], + + **overrides, + + } + + + + agent_id = agent.create_session(client, **options).start() + + + + assert agent_id == "agent-id" + + assert len(agents.calls) == 1 + + return agents.calls[0] + + + + + +def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["appid"] == "appid" + + assert call["name"] == "support" + + assert call["pipeline_id"] == "studio-pipeline-id" + + properties = dump(call["properties"]) + + assert properties["channel"] == "channel" + + assert properties["token"] == "token" + + assert properties["agent_rtc_uid"] == "1" + + assert properties["remote_rtc_uids"] == ["100"] + + + + + +def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + + call = start_agent( + + Agent(name="support", pipeline_id="agent-pipeline"), + + pipeline_id="session-pipeline", + + ) + + + + assert call["pipeline_id"] == "session-pipeline" + + + + + +def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + + + + +def test_pipeline_id_is_not_sent_inside_properties() -> None: + + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + assert "pipeline_id" not in dump(call["properties"]) + + + + + +def 
test_pipeline_id_survives_builder_clone() -> None: + + agent = Agent(name="support", pipeline_id="studio-pipeline-id").with_tools(True) + + + + assert agent.pipeline_id == "studio-pipeline-id" + + call = start_agent(agent) + + + + assert call["pipeline_id"] == "studio-pipeline-id" + + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + + + + +@pytest.mark.asyncio + +async def test_async_session_uses_agent_pipeline_id() -> None: + + agents = FakeAsyncAgentsClient() + + client = FakeClient(agents) + + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + + + agent_id = await agent.create_async_session( + + client, + + channel="channel", + + token="token", + + agent_uid="1", + + remote_uids=["100"], + + ).start() + + + + assert agent_id == "agent-id" + + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | + | `failure_message` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | + | `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. 
| + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. 
Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. + + + ```python + from agora_agent import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Deprecated. 
Configure `system_messages` on the LLM vendor instead. + + ### `with_greeting(greeting: str) -> Agent` + + Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. + + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. + + ### `with_max_history(max_history: int) -> Agent` + + Deprecated. Configure `max_history` on the LLM vendor instead. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). 
+ + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. 
+ + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. + + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | + | `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | + | `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | + | `max_history` | `Optional[int]` | Deprecated Agent-level max history | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, 
`AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + docs/reference/session.md: | + --- + sidebar_position: 3 + title: AgentSession + description: Full API reference for the Python AgentSession class. + --- + + # AgentSession / AsyncAgentSession Reference + + **Import:** + + ```python + from agora_agent import AgentSession + from agora_agent import AsyncAgentSession + # or from top-level: + from agora_agent import AgentSession, AsyncAgentSession + ``` + + ## Constructor + + Sessions are normally created via `Agent.create_session()`. Direct construction is available for advanced use: + + + ```python + AgentSession( + client: Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: List[str], + app_certificate: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + debug: Optional[bool] = None, + warn: Optional[Callable[[str], None]] = None, + ) + ``` + + `AsyncAgentSession` has the same constructor signature. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `agent` | `Agent` | Yes | Agent configuration | + | `app_id` | `str` | Yes | Agora App ID | + | `name` | `str` | Yes | Session name | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `app_certificate` | `Optional[str]` | No | App Certificate (for auto token generation) | + | `token` | `Optional[str]` | No | Pre-built RTC token | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + | `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | + | `debug` | `Optional[bool]` | No | Enable debug logging of the start request | + | `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. + + ## Methods + + ### `start()` + + Start the agent session. Generates an RTC token if not provided, validates avatar/TTS config for cascading sessions, and calls the Agora API. MLLM sessions do not require TTS; an enabled avatar is rejected when MLLM is configured (a disabled avatar is allowed). 
+ + | | Sync (`AgentSession`) | Async (`AsyncAgentSession`) | + |---|---|---| + | **Signature** | `start() -> str` | `async start() -> str` | + | **Returns** | Agent ID | Agent ID | + | **Raises** | `RuntimeError` if not in `idle`, `stopped`, or `error` state | Same | + | **Raises** | `ValueError` if avatar/TTS sample rate mismatch or an enabled avatar is used with MLLM | Same | + + + ```python + # Sync + agent_id = session.start() + + # Async + agent_id = await session.start() + ``` + + ### `stop()` + + Stop the agent session. If the agent has already stopped (404 from API), transitions to `stopped` without raising. + + | | Sync | Async | + |---|---|---| + | **Signature** | `stop() -> None` | `async stop() -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + # Sync + session.stop() + + # Async + await session.stop() + ``` + + ### `say(text, priority=None, interruptable=None)` + + Send text to be spoken by the agent's TTS. + + | | Sync | Async | + |---|---|---| + | **Signature** | `say(text: str, priority: Optional[str] = None, interruptable: Optional[bool] = None) -> None` | Same with `async` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `text` | `str` | Yes | Text to speak | + | `priority` | `str` | No | `INTERRUPT`, `APPEND`, or `IGNORE` | + | `interruptable` | `bool` | No | Whether the message can be interrupted | + + + ```python + # Sync + session.say('Hello!', priority='INTERRUPT', interruptable=False) + + # Async + await session.say('Hello!', priority='INTERRUPT', interruptable=False) + ``` + + ### `interrupt()` + + Interrupt the agent while speaking or thinking. 
+ + | | Sync | Async | + |---|---|---| + | **Signature** | `interrupt() -> None` | `async interrupt() -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + # Sync + session.interrupt() + + # Async + await session.interrupt() + ``` + + ### `update(properties)` + + Update the agent configuration at runtime. + + | | Sync | Async | + |---|---|---| + | **Signature** | `update(properties: Any) -> None` | `async update(properties: Any) -> None` | + | **Raises** | `RuntimeError` if not in `running` state | Same | + + + ```python + from agora_agent.agents.types import UpdateAgentsRequestProperties + + # Sync + session.update(properties) + + # Async + await session.update(properties) + ``` + + ### `think(text, ...)` + + Inject a custom text instruction into the running agent. + + In API v2.7, omitting `on_listening_action` uses the server default `interrupt`. Pass `on_listening_action='inject'` explicitly to preserve the pre-v2.7 behavior. + + ```python + session.think('Summarize the last answer', on_listening_action='inject') + ``` + + ### `get_history()` + + Retrieve the conversation history. + + | | Sync | Async | + |---|---|---| + | **Signature** | `get_history() -> Any` | `async get_history() -> Any` | + | **Raises** | `RuntimeError` if no agent ID | Same | + + + ```python + # Sync + history = session.get_history() + + # Async + history = await session.get_history() + ``` + + ### `get_info()` + + Retrieve the current session info. + + | | Sync | Async | + |---|---|---| + | **Signature** | `get_info() -> Any` | `async get_info() -> Any` | + | **Raises** | `RuntimeError` if no agent ID | Same | + + + ```python + # Sync + info = session.get_info() + + # Async + info = await session.get_info() + ``` + + ### `get_turns(page_index=None, page_size=None)` + + Retrieve paginated turn analytics for a completed or running session. In v2.7, the API defaults to page 1 and up to 50 turns per page. 
Responses include `agent_id`, `name`, `channel`, `total_turn_count`, `pagination`, and `turns`. + + ```python + page = session.get_turns(page_index=1, page_size=50) + ``` + + ### `get_all_turns(page_size=None)` + + Fetch all turn pages and return a single `GetTurnsAgentsResponse` with the combined `turns` list. + + ```python + all_turns = session.get_all_turns(page_size=50) + ``` + + ### `on(event, handler)` + + Register an event handler. This method is synchronous on both `AgentSession` and `AsyncAgentSession`. + + + ```python + session.on('started', lambda data: print(f'Started: {data}')) + ``` + + | Parameter | Type | Description | + |---|---|---| + | `event` | `str` | Event type: `started`, `stopped`, or `error` | + | `handler` | `Callable[..., None]` | Callback function | + + ### `off(event, handler)` + + Remove a previously registered event handler. + + + ```python + session.off('started', my_handler) + ``` + + ## Properties + + | Property | Type | Description | + |---|---|---| + | `id` | `Optional[str]` | Agent ID (set after `start()`) | + | `status` | `str` | Current state: `idle`, `starting`, `running`, `stopping`, `stopped`, `error` | + | `agent` | `Agent` | The agent configuration | + | `app_id` | `str` | Agora App ID | + | `raw` | `AgentsClient` / `AsyncAgentsClient` | Direct access to Fern-generated agents client | + + ## State Transitions + + | Current State | Allowed Actions | + |---|---| + | `idle` | `start()` | + | `starting` | (waiting for API) | + | `running` | `stop()`, `say()`, `interrupt()`, `update()`, `get_history()`, `get_info()` | + | `stopping` | (waiting for API) | + | `stopped` | `start()` (restart) | + | `error` | `start()` (retry) | + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import 
StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from 
..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config 
import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..types.asr import Asr + from ..types.llm import Llm + from ..types.llm_style import LlmStyle as GeneratedLlmStyle + from ..types.mllm import Mllm + from ..types.mllm_turn_detection import MllmTurnDetection + from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode + from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = Llm + LlmStyle = GeneratedLlmStyle + SttConfig = Asr + AsrConfig = SttConfig + SttVendor = typing.Any + TtsConfig = Tts + MllmConfig = Mllm + MllmVendor = GeneratedMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + 
TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = MllmTurnDetection + MllmTurnDetectionMode = GeneratedMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: 
AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = typing.Dict[str, typing.Any] + LlmGreetingConfigsMode = typing.Any + McpServersItem = typing.Dict[str, typing.Any] + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + TurnDetectionLanguage = typing_extensions.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", 
+ "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ] + + DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" + TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] = ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ) + _TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + + + def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + def _is_turn_detection_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _TURN_DETECTION_LANGUAGES + + + def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: + if not _is_turn_detection_language(value): + raise ValueError(f"Invalid interaction language: {value}") + return value # type: ignore[return-value] + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Deprecated: + The Agent-level ``instructions``, ``greeting``, ``failure_message``, + ``max_history``, and ``greeting_configs`` convenience fields are kept + for compatibility. Configure those values on the LLM or MLLM vendor + instead. 
+ + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, + ): + self._name = name + self._pipeline_id = pipeline_id + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + 
self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + """Deprecated. Configure system messages on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + """Deprecated. Configure the greeting on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Deprecated. Configure greeting playback on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Deprecated. 
Configure the failure message on the LLM or MLLM vendor instead.""" + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Deprecated. Configure max history on the LLM vendor instead.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def pipeline_id(self) -> typing.Optional[str]: + """Published AI Studio pipeline ID used as this agent's base configuration.""" + return self._pipeline_id + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, 
typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "pipeline_id": self._pipeline_id, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": 
self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], 
None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + 
base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + base_kwargs["asr"] = self._resolve_asr_config() + base_kwargs["turn_detection"] = self._resolve_turn_detection_config() + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. 
+ if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + + return StartAgentsRequestProperties(**base_kwargs) + + def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + asr_config = dict(self._stt or {}) + asr_config.pop("language", None) + if not asr_config: + asr_config["vendor"] = "ares" + return asr_config + + def _resolve_turn_detection_config(self) -> TurnDetectionConfig: + existing_stt_language = self._stt.get("language") if self._stt is not None else None + existing_turn_detection_language = self._field_value(self._turn_detection, "language") + language = ( + existing_turn_detection_language + if existing_turn_detection_language is not None + else existing_stt_language + if _is_turn_detection_language(existing_stt_language) + else DEFAULT_TURN_DETECTION_LANGUAGE + ) + language = _validate_turn_detection_language(language) + if self._turn_detection is None: + return StartAgentsRequestPropertiesTurnDetection(language=language) + if isinstance(self._turn_detection, dict): + return typing.cast(TurnDetectionConfig, {**self._turn_detection, "language": language}) + return self._copy_model_update(self._turn_detection, {"language": language}) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._pipeline_id = self._pipeline_id + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = 
self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + 
is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation: bool, + ) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=skip_vendor_validation, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] 
= self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. 
def stop(self) -> None:
    """Stop the agent session.

    A 404 from the server (the agent is already gone, e.g. it crashed or
    timed out) is treated as a successful stop: the session becomes
    ``stopped`` and the ``stopped`` event is emitted. Any other failure
    marks the session as ``error``, emits ``error`` and re-raises.

    Raises
    ------
    RuntimeError
        If the session is not running or has no agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot stop session in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    self._status = "stopping"

    try:
        self._client.agents.stop(
            self._app_id, self._agent_id, request_options=self._request_options()
        )
        self._status = "stopped"
        self._emit("stopped", {"agent_id": self._agent_id})
    except Exception as exc:
        already_gone = isinstance(exc, ApiError) and exc.status_code == 404
        if already_gone:
            self._status = "stopped"
            self._emit("stopped", {"agent_id": self._agent_id})
            return
        self._status = "error"
        self._emit("error", exc)
        raise
def think(
    self,
    text: str,
    *,
    on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None,
    on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None,
    on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None,
    interruptable: typing.Optional[bool] = None,
    metadata: typing.Optional[typing.Dict[str, str]] = None,
    options: typing.Optional["ThinkOptions"] = None,
) -> AgentThinkResponse:
    """Inject a custom text instruction into the current session pipeline.

    In API v2.7, omitting ``on_listening_action`` uses the server default
    ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to
    preserve the pre-v2.7 behavior.

    The request starts from ``text``, is then updated with ``options`` (if
    given), and finally with every keyword argument that is not ``None``,
    so explicit keyword arguments take precedence over ``options``.

    Raises
    ------
    RuntimeError
        If the session is not running or has no agent ID.
    """
    if self._status != "running":
        raise RuntimeError(f"Cannot think in {self._status} state")
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    request_fields: typing.Dict[str, typing.Any] = {"text": text}
    if options is not None:
        request_fields.update(options)

    explicit_overrides = (
        ("on_listening_action", on_listening_action),
        ("on_thinking_action", on_thinking_action),
        ("on_speaking_action", on_speaking_action),
        ("interruptable", interruptable),
        ("metadata", metadata),
    )
    for field_name, field_value in explicit_overrides:
        if field_value is not None:
            request_fields[field_name] = field_value

    return self._client.agent_management.agent_think(
        self._app_id,
        self._agent_id,
        request_options=self._request_options(),
        **request_fields,
    )
def get_turns(
    self,
    *,
    page_index: typing.Optional[int] = None,
    page_size: typing.Optional[int] = None,
    options: typing.Optional["GetTurnsOptions"] = None,
) -> GetTurnsAgentsResponse:
    """Get turn-by-turn analytics and timing details for this session.

    Query arguments start from ``options`` (if given); ``page_index`` and
    ``page_size`` override the matching entries when they are not ``None``.

    Raises
    ------
    RuntimeError
        If the session has no agent ID.
    """
    if not self._agent_id:
        raise RuntimeError("No agent ID available")

    query: typing.Dict[str, typing.Any] = {}
    if options is not None:
        query.update(options)
    for field_name, field_value in (("page_index", page_index), ("page_size", page_size)):
        if field_value is not None:
            query[field_name] = field_value

    return self._client.agents.get_turns(
        self._app_id,
        self._agent_id,
        request_options=self._request_options(),
        **query,
    )
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = 
StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). 
+ interruptable : bool, optional + Whether the message can be interrupted by the user. + """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_pipeline_id.py: | + import pytest + + from agora_agent import Agent + + + def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class StartResponse: + agent_id = "agent-id" + + + class FakeAgentsClient: + def __init__(self): + self.calls = [] + + def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + + class FakeAsyncAgentsClient: + def __init__(self): + self.calls = [] + + async def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + + class FakeClient: + app_id = "appid" + app_certificate = None + + def __init__(self, agents): + self.agents = agents + + + def start_agent(agent, **overrides): + agents = FakeAgentsClient() + client = FakeClient(agents) + options = { + "channel": "channel", + "token": "token", + "agent_uid": "1", + "remote_uids": ["100"], + **overrides, + } + + agent_id = agent.create_session(client, **options).start() + + assert agent_id == "agent-id" + assert len(agents.calls) == 1 + return agents.calls[0] + + + def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["appid"] == "appid" + assert call["name"] == "support" + assert call["pipeline_id"] == "studio-pipeline-id" + properties = dump(call["properties"]) + assert properties["channel"] == "channel" + assert properties["token"] == "token" + assert properties["agent_rtc_uid"] == "1" + assert properties["remote_rtc_uids"] == ["100"] + + + def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + call = start_agent( + Agent(name="support", pipeline_id="agent-pipeline"), + pipeline_id="session-pipeline", + ) + + assert call["pipeline_id"] == 
"session-pipeline" + + + def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + + + def test_pipeline_id_is_not_sent_inside_properties() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(call["properties"]) + + + def test_pipeline_id_survives_builder_clone() -> None: + agent = Agent(name="support", pipeline_id="studio-pipeline-id").with_tools(True) + + assert agent.pipeline_id == "studio-pipeline-id" + call = start_agent(agent) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + + @pytest.mark.asyncio + async def test_async_session_uses_agent_pipeline_id() -> None: + agents = FakeAsyncAgentsClient() + client = FakeClient(agents) + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + agent_id = await agent.create_async_session( + client, + channel="channel", + token="token", + agent_uid="1", + remote_uids=["100"], + ).start() + + assert agent_id == "agent-id" + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) + status: unresolved + - id: patch-8e22e6d0 + content_hash: sha256:4baa4d46c129dde02b82a8367fdc1f9217d52267f82eb18f190d230d39a90927 + original_commit: 8e22e6d069e77f4c652e15f2f37945538c88c7c4 + original_message: udpated agent docs + original_author: Hermes (agora) + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 8e22e6d069e77f4c652e15f2f37945538c88c7c4 Mon Sep 17 00:00:00 2001 + From: "Hermes (agora)" + Date: Tue, 2 Jun 2026 15:36:16 -0400 + Subject: [PATCH] udpated agent docs + + --- + docs/reference/agent.md | 2 +- + 1 file changed, 1 
insertion(+), 1 deletion(-) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 86d4fbd..5693e0b 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -34,7 +34,6 @@ Agent( + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + -| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + @@ -48,6 +47,7 @@ Agent( + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | + | `failure_message` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | + | `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. 
| + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + | `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + + `pipeline_id` is an AI Studio base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + + The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. 
+ + + ```python + from agora_agent import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. + + + ```python + from agora_agent import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. 
+ + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Deprecated. Configure `system_messages` on the LLM vendor instead. + + ### `with_greeting(greeting: str) -> Agent` + + Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. + + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. + + ### `with_max_history(max_history: int) -> Agent` + + Deprecated. Configure `max_history` on the LLM vendor instead. 
+ + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. + + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. 
| + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + | `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | + | `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + + `pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | + | `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | + | `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | + | `max_history` | `Optional[int]` | Deprecated Agent-level max history | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ status: unresolved + - id: patch-bed29b6b + content_hash: sha256:8008d9c33a194a48ef317868953c26d5b03ede60c23743b4249260894c0f6417 + original_commit: bed29b6b7d4d08480a8510b26b5e21d1ef234cc9 + original_message: "chore: bump Python packages to 2.1.0" + original_author: digitallysavvy + base_generation: b66d871314ca0e5929cb9c9095949a7fd5e856a7 + files: + - compat/agora-agent-server-sdk/pyproject.toml + patch_content: | + diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index ac93128..468294b 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + -version = "v2.0.0" + +version = "v2.1.0" + description = "Compatibility shim for the renamed agora-agents package." + readme = "README.md" + authors = [] + @@ -35,7 +35,7 @@ Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-pyth + + [tool.poetry.dependencies] + python = "^3.8" + -agora-agents = ">=2.0.0,<3.0.0" + +agora-agents = ">=2.1.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + theirs_snapshot: + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.1.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.1.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + user_owned: true diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index cfa8580..48b9053 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -336,6 +336,7 @@ Use `turn_detection.language` for Agora interaction language; it defaults to `en | `api_key` | `str` | BYOK only | `None` | Deepgram API key. Optional only for Agora-managed `nova-2` and `nova-3`. 
| | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | | `smart_format` | `bool` | No | `None` | Enable smart formatting | | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | diff --git a/scripts/check_release_workflow.py b/scripts/check_release_workflow.py new file mode 100644 index 0000000..1a6e065 --- /dev/null +++ b/scripts/check_release_workflow.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import re +import sys +from pathlib import Path +from typing import NoReturn + + +def fail(message: str) -> NoReturn: + print(message, file=sys.stderr) + raise SystemExit(1) + + +def read_version(path: str) -> str: + text = Path(path).read_text() + match = re.search(r'^version\s*=\s*"v?([^"]+)"', text, re.M) + if not match: + fail(f"version not found in {path}") + return match.group(1) + + +def read_compat_dependency(path: str) -> str: + text = Path(path).read_text() + match = re.search(r'^agora-agents\s*=\s*"([^"]+)"', text, re.M) + if not match: + fail(f"agora-agents dependency not found in {path}") + return match.group(1) + + +root_version = read_version("pyproject.toml") +compat_pyproject = "compat/agora-agent-server-sdk/pyproject.toml" +compat_version = read_version(compat_pyproject) +compat_dependency = read_compat_dependency(compat_pyproject) + +if compat_version != root_version: + fail(f"Compat package version ({compat_version}) must match root package version ({root_version}).") + +expected_dependency = f">={root_version},<3.0.0" +if compat_dependency != expected_dependency: + fail(f"Compat package dependency on agora-agents ({compat_dependency}) must be {expected_dependency}.") + +release_workflow = Path(".github/workflows/release.yml").read_text() +required_workflow_markers = [ + ("contents: write", "release workflow 
must have contents: write so it can create GitHub releases"), + ("gh release create", "release workflow must create a GitHub release when one does not exist"), + ("gh release edit", "release workflow must update an existing GitHub release"), + ("release_notes.md", "release workflow must generate and use a release notes file"), +] + +for marker, message in required_workflow_markers: + if marker not in release_workflow: + fail(message) + +print("Release metadata and workflow checks passed.") diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 6275f04..ecf01c6 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -57,6 +57,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts from ..types.asr import Asr from ..types.llm import Llm from ..types.llm_style import LlmStyle as GeneratedLlmStyle @@ -536,6 +538,23 @@ def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent ) return new_agent + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, 
+ ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + def with_failure_message(self, message: str) -> "Agent": """Deprecated. Configure the failure message on the LLM or MLLM vendor instead.""" new_agent = self._clone() diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index dbff562..dca9ee8 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -24,6 +24,7 @@ is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 50bdd08..f48098c 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -177,6 +177,49 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "generic", "params": params} +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + +class 
GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 9156a01..5dd822d 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,7 +1,10 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field, model_validator +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, +) from .base import BaseLLM LlmGreetingConfigs = Dict[str, Any] diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index 236a494..6a260d8 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import BaseModel, ConfigDict, Field diff --git 
a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index e5117b0..bb222a9 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -89,6 +89,7 @@ class SpeechmaticsSTTOptions(BaseModel): api_key: str = Field(..., description="Speechmatics API key") language: str = Field(..., description="Language code (e.g., en, es, fr)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -124,6 +125,7 @@ class DeepgramSTTOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="Deepgram API key") model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -353,6 +355,7 @@ class SarvamSTTOptions(BaseModel): api_key: str = Field(..., description="Sarvam API key") language: str = Field(..., description="Language code (e.g., en, hi, ta)") + interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") additional_params: Optional[Dict[str, Any]] = Field(default=None) diff --git a/src/agora_agent/agentkit/vendors/tts.py 
b/src/agora_agent/agentkit/vendors/tts.py index a052ea5..3986986 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -139,6 +139,8 @@ def to_config(self) -> Dict[str, Any]: } if self.options.api_key is not None: params["api_key"] = self.options.api_key + if self.options.base_url is not None: + params["base_url"] = self.options.base_url params["base_url"] = self.options.base_url params["model"] = self.options.model elif self.options.model is not None: @@ -254,6 +256,8 @@ def to_config(self) -> Dict[str, Any]: "voice": self.options.voice_id, "engine": self.options.engine, } + if self.options.engine is not None: + params["engine"] = self.options.engine result: Dict[str, Any] = {"vendor": "amazon", "params": params} if self.options.skip_patterns is not None: @@ -392,6 +396,8 @@ def to_config(self) -> Dict[str, Any]: "reference_id": self.options.reference_id, "backend": self.options.backend, } + if self.options.backend is not None: + params["backend"] = self.options.backend result: Dict[str, Any] = {"vendor": "fishaudio", "params": params} if self.options.skip_patterns is not None: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py index 40dbb02..fb58a36 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py @@ -5,6 +5,7 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr_language import AsrLanguage from .start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from .start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness from 
.start_agents_request_properties_turn_detection_interrupt_mode import ( @@ -18,6 +19,11 @@ class StartAgentsRequestPropertiesTurnDetection(UncheckedBaseModel): Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. """ + language: typing.Optional[AsrLanguage] = pydantic.Field(default=None) + """ + BCP-47 language tag identifying the primary language used for agent interaction. + """ + mode: typing.Optional[typing.Literal["default"]] = pydantic.Field(default=None) """ Conversation turn detection mode: diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..9719b04 --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), 
{"data_channel": "rtm"}) # noqa: SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def test_with_audio_scenario_sets_session_parameter(): + agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "agent greeting" + assert properties.llm.failure_message == "agent failure" + assert properties.llm.max_history == 2 + + +def 
test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + ) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + 
token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard must fire before the token-generation step so callers get a + clear, actionable error even when app_id/app_certificate are empty. + """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + 
.with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..198fcd0 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,383 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, + RimeTTS, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + 
agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + warn=warn, + ) + + +def test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") 
+ .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["llm"]["greeting_message"] == "agent greeting" + assert properties["llm"]["failure_message"] == "agent failure" + assert properties["llm"]["max_history"] == 2 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def 
test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # 
noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 + + +def test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = 
AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..8473821 --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,122 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +import warnings + +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + +def test_xai_realtime_deprecated_alias_emits_same_vendor(): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", DeprecationWarning) + config = XaiRealtime(api_key="xai-key").to_config() + assert len(caught) == 1 + assert issubclass(caught[0].category, DeprecationWarning) + assert config["vendor"] == "xai" + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", 
predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def test_generic_avatar_omits_session_enriched_fields_when_unset(): + config = GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + assert config["params"]["model"] == "explicit-model" + assert config["params"]["project_id"] == "explicit-project" + assert config["params"]["location"] == "explicit-region" + assert config["params"]["adc_credentials_string"] == "{}" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), + ).to_config() + + assert 
config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False From f66343802c61a82a61468fa65d2756a8ef467da3 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 4 Jun 2026 16:55:09 -0400 Subject: [PATCH 05/14] fix(agentkit): OpenAISTT required fields, Deepgram keyterm, VertexAILLM URL, LLM precedence, TurnDetection language cleanup - OpenAISTT.to_config: change default model to gpt-4o-mini-transcribe; validate that model, prompt, and language are present after applying input_audio_transcription overrides - DeepgramSTTOptions: add keyterm field emitted as params.keyterm - OpenAITTSOptions: strengthen empty check from to for base_url/model - VertexAILLM.to_config: construct correct Vertex AI endpoint URL from location/project_id/model - _resolve_llm_config: vendor-provided keys now win over agent-level convenience fields - Remove bare en from TurnDetectionLanguage and TURN_DETECTION_LANGUAGE_VALUES --- src/agora_agent/agentkit/agent.py | 14 +++++--------- src/agora_agent/agentkit/vendors/llm.py | 8 +++++++- src/agora_agent/agentkit/vendors/stt.py | 12 +++++++++++- src/agora_agent/agentkit/vendors/tts.py | 2 +- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f7ba770..7447d0b 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -221,7 +221,6 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "zh-HK", "zh-TW", "nl-NL", - "en", "en-IN", "en-US", "fil-PH", @@ -258,7 +257,6 @@ class SessionOptions(typing_extensions.TypedDict, total=False): "zh-HK", "zh-TW", "nl-NL", - "en", "en-IN", "en-US", "fil-PH", @@ -944,17 +942,15 @@ def to_properties( def _resolve_llm_config(self) -> typing.Dict[str, typing.Any]: llm_config = dict(self._llm or {}) - # Agent-level fields take priority over the vendor's defaults. 
- # This matches the TS SDK where agent-level values override vendor config. - if self._instructions is not None: + if self._instructions is not None and "system_messages" not in llm_config: llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] - if self._greeting is not None: + if self._greeting is not None and "greeting_message" not in llm_config: llm_config["greeting_message"] = self._greeting - if self._greeting_configs is not None: + if self._greeting_configs is not None and "greeting_configs" not in llm_config: llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) - if self._failure_message is not None: + if self._failure_message is not None and "failure_message" not in llm_config: llm_config["failure_message"] = self._failure_message - if self._max_history is not None: + if self._max_history is not None and "max_history" not in llm_config: llm_config["max_history"] = self._max_history return llm_config diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 9156a01..1fc4b22 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -376,8 +376,14 @@ def to_config(self) -> Dict[str, Any]: options = _dump_optional_model(self.options) options.pop("project_id", None) options.pop("location", None) + if not options.get("url"): + options["url"] = ( + f"https://{self.options.location}-aiplatform.googleapis.com/v1/projects/" + f"{self.options.project_id}/locations/{self.options.location}/" + f"publishers/google/models/{self.options.model}:streamGenerateContent?alt=sse" + ) config = Gemini(**options).to_config() - params = dict(config["params"]) + params = dict(config.get("params") or {}) params["project_id"] = self.options.project_id params["location"] = self.options.location config["params"] = params diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index 383147a..d390573 100644 --- 
a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -44,6 +44,7 @@ class DeepgramSTTOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="Deepgram API key") model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + keyterm: Optional[str] = Field(default=None, description="Boost specialized terms and brands for Deepgram") smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -71,6 +72,8 @@ def to_config(self) -> Dict[str, Any]: params["smart_format"] = self.options.smart_format if self.options.punctuation is not None: params["punctuation"] = self.options.punctuation + if self.options.keyterm is not None: + params["keyterm"] = self.options.keyterm config: Dict[str, Any] = { "vendor": "deepgram", "params": params, @@ -124,13 +127,20 @@ def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = dict(self.options.additional_params or {}) params["api_key"] = self.options.api_key - transcription = {"model": "whisper-1", **(self.options.input_audio_transcription or {})} + transcription: Dict[str, Any] = {"model": "gpt-4o-mini-transcribe"} + transcription.update(self.options.input_audio_transcription or {}) if self.options.model is not None: transcription["model"] = self.options.model if self.options.prompt is not None: transcription["prompt"] = self.options.prompt if self.options.language is not None: transcription["language"] = self.options.language + if not transcription.get("model"): + raise ValueError("OpenAISTT: input_audio_transcription.model is required") + if not transcription.get("prompt"): + raise ValueError("OpenAISTT: input_audio_transcription.prompt is required") + if not 
transcription.get("language"): + raise ValueError("OpenAISTT: input_audio_transcription.language is required") params["input_audio_transcription"] = transcription config: Dict[str, Any] = { diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index a052ea5..9120d7d 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -114,7 +114,7 @@ def _validate_byok_params(self) -> "OpenAITTSOptions": ("model", self.model), ("base_url", self.base_url), ) - if value is None + if not value ] if missing: raise ValueError(f"OpenAITTS requires {', '.join(missing)} when api_key is set") From 70926e12f8b78b4e6eb0b144d32798cd8b869f2c Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 4 Jun 2026 22:10:46 -0400 Subject: [PATCH 06/14] fix(agentkit): implement MiniMax preset model stripping in to_config and preset resolution - MiniMaxTTS.to_config: model, key, group_id, url now only emitted in params on the BYOK path (key set); for managed preset path, model is stored as _minimax_preset_model at the config top level as an inference hint - presets.py infer_tts_preset: falls back to _minimax_preset_model when model is absent from params (preset path) - strip_inferred_preset_fields: pops _minimax_preset_model from the tts dict so the hint never reaches the wire --- src/agora_agent/agentkit/presets.py | 5 ++++- src/agora_agent/agentkit/vendors/tts.py | 11 +++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/agora_agent/agentkit/presets.py b/src/agora_agent/agentkit/presets.py index 68d27df..4f1b145 100644 --- a/src/agora_agent/agentkit/presets.py +++ b/src/agora_agent/agentkit/presets.py @@ -137,7 +137,9 @@ def infer_tts_preset(tts: typing.Optional[typing.Dict[str, typing.Any]]) -> typi if vendor == "minimax": if params.get("key"): return None - return _MINIMAX_MODEL_TO_PRESET.get(_normalize_model_name(params.get("model")) or "") + # Model is no longer in params for the 
preset path; fall back to the top-level hint. + model = _normalize_model_name(params.get("model")) or _normalize_model_name(tts.get("_minimax_preset_model")) or "" + return _MINIMAX_MODEL_TO_PRESET.get(model) return None @@ -184,6 +186,7 @@ def strip_inferred_preset_fields(properties: typing.Dict[str, typing.Any], infer params["group_id"] = None params["url"] = None tts = {k: v for k, v in {**tts, "params": _omit_none(params)}.items() if v is not None} + tts.pop("_minimax_preset_model", None) return {**properties, "asr": asr, "llm": llm, "tts": tts} diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index 9120d7d..acfec78 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -436,17 +436,20 @@ def sample_rate(self) -> Optional[int]: return None def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"model": self.options.model} + params: Dict[str, Any] = {} if self.options.key is not None: params["key"] = self.options.key - if self.options.group_id is not None: params["group_id"] = self.options.group_id + params["model"] = self.options.model + params["url"] = self.options.url if self.options.voice_id is not None: params["voice_setting"] = {"voice_id": self.options.voice_id} - if self.options.url is not None: - params["url"] = self.options.url result: Dict[str, Any] = {"vendor": "minimax", "params": params} + if self.options.key is None: + # Preset path: model not in params; stored as top-level hint for preset + # inference. Stripped by strip_inferred_preset_fields before the POST body. 
+ result["_minimax_preset_model"] = self.options.model if self.options.skip_patterns is not None: result["skip_patterns"] = self.options.skip_patterns return result From d2cb0319f46cc7197b10778294ae8f1a537d0323 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 09:14:12 -0400 Subject: [PATCH 07/14] updated mllm vendors --- src/agora_agent/agentkit/vendors/mllm.py | 7 +++---- tests/custom/test_agentkit_vendors.py | 8 +++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index 4fd47a4..6a260d8 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -179,12 +179,11 @@ def to_config(self) -> Dict[str, Any]: if self.options.http_options is not None: params["http_options"] = self.options.http_options - params["project_id"] = self.options.project_id - params["location"] = self.options.location - params["adc_credentials_string"] = self.options.adc_credentials_string - config: Dict[str, Any] = { "vendor": "vertexai", + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, "params": params, } diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py index e00ecc5..e84b11e 100644 --- a/tests/custom/test_agentkit_vendors.py +++ b/tests/custom/test_agentkit_vendors.py @@ -79,10 +79,12 @@ def test_vertex_ai_explicit_fields_override_additional_params(): ).to_config() assert config["vendor"] == "vertexai" + # routing fields are top-level, not inside params + assert config["project_id"] == "explicit-project" + assert config["location"] == "explicit-region" + assert config["adc_credentials_string"] == "{}" + # model and extra_key live inside params assert config["params"]["model"] == "explicit-model" - assert config["params"]["project_id"] == "explicit-project" - assert config["params"]["location"] == 
"explicit-region" - assert config["params"]["adc_credentials_string"] == "{}" assert config["params"]["extra_key"] == "kept" From bbed4e2f6448de3c9e85a74a70997d42c5f3fbbe Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 09:58:57 -0400 Subject: [PATCH 08/14] Add full request body coverage and fix preset inference --- src/agora_agent/agentkit/presets.py | 2 +- src/agora_agent/agentkit/vendors/tts.py | 8 +- tests/custom/test_request_body.py | 967 ++++++++++++++++++++++++ tests/custom/test_tts_vendors.py | 6 +- 4 files changed, 975 insertions(+), 8 deletions(-) create mode 100644 tests/custom/test_request_body.py diff --git a/src/agora_agent/agentkit/presets.py b/src/agora_agent/agentkit/presets.py index 4f1b145..4d3409d 100644 --- a/src/agora_agent/agentkit/presets.py +++ b/src/agora_agent/agentkit/presets.py @@ -108,7 +108,7 @@ def infer_asr_preset(asr: typing.Optional[typing.Dict[str, typing.Any]]) -> typi if not asr or asr.get("vendor") != "deepgram": return None params = asr.get("params") or {} - if params.get("api_key"): + if params.get("key"): return None return _DEEPGRAM_MODEL_TO_PRESET.get(_normalize_model_name(params.get("model")) or "") diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index acfec78..0a7b699 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -214,13 +214,13 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "credentials": self.options.key, - "VoiceSelectionParams": {"name": self.options.voice_name}, + "voice_selection_params": {"name": self.options.voice_name}, } if self.options.language_code is not None: - params["VoiceSelectionParams"]["language_code"] = self.options.language_code + params["voice_selection_params"]["language_code"] = self.options.language_code if self.options.sample_rate_hertz is not None: - params["AudioConfig"] = {"sample_rate_hertz": 
self.options.sample_rate_hertz} + params["audio_config"] = {"sample_rate_hertz": self.options.sample_rate_hertz} result: Dict[str, Any] = {"vendor": "google", "params": params} if self.options.skip_patterns is not None: @@ -359,7 +359,7 @@ def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.key, "speaker": self.options.speaker, - "modelId": self.options.model_id, + "model_id": self.options.model_id, } if self.options.base_url is not None: params["base_url"] = self.options.base_url diff --git a/tests/custom/test_request_body.py b/tests/custom/test_request_body.py new file mode 100644 index 0000000..bf985f0 --- /dev/null +++ b/tests/custom/test_request_body.py @@ -0,0 +1,967 @@ +""" +test_request_body.py — Integration-level tests for request body shape. + +Covers: + Scenario 1 — BYOK pipeline (full properties shape) + Scenario 2 — Preset-backed pipeline (managed vendors, field-stripping) + Scenario 3 — LLM config fields win over agent-level convenience fields + Scenario 4 — VertexAILLM URL construction + Scenario 5 — OpenAISTT params (5a model, 5b prompt, 5c language, 5d defaults) + Scenario 6 — Mixed preset + BYOK (6a ASR preset + BYOK LLM/TTS, 6b TTS preset + BYOK LLM/ASR) + Scenario 7 — Pipeline ID (7b shape with BYOK LLM, 7c empty properties) + Scenario 8 — MLLM mode (8a start call, 8b/8c agent-level greeting wins/vendor wins) + BYOK vendor coverage matrix (all STT, LLM, TTS vendors) + Preset coverage matrix (all inferred presets) +""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from agora_agent import ( + Agent, + AmazonBedrock, + AmazonSTT, + AmazonTTS, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + CartesiaTTS, + CustomLLM, + DeepgramSTT, + DeepgramTTS, + Dify, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GoogleSTT, + GoogleTTS, + Groq, + HumeAITTS, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISTT, + 
OpenAITTS, + RimeTTS, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + VertexAILLM, + XaiGrok, +) +from agora_agent.agentkit import AgentSession +from agora_agent.agentkit.presets import resolve_session_presets + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + +# --------------------------------------------------------------------------- +# Pattern 1: FakeAgentsClient — captures the full start() call +# --------------------------------------------------------------------------- + + +class StartResponse: + agent_id = "agent-id" + + +class FakeAgentsClient: + def __init__(self): + self.calls = [] + + def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + +class FakeClient: + app_id = "appid" + app_certificate = None + + def __init__(self, agents): + self.agents = agents + + +def start_session(agent, **session_kwargs): + """Start agent session via FakeAgentsClient and return the captured call dict.""" + agents = FakeAgentsClient() + client = FakeClient(agents) + agent.create_session( + client=client, + channel="channel", + token="test-token", + agent_uid="1", + remote_uids=["100"], + **session_kwargs, + ).start() + return agents.calls[0] + + +# --------------------------------------------------------------------------- +# Pattern 2: _build_start_properties — properties-only shape +# --------------------------------------------------------------------------- + + +class _Agents: + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + return SimpleNamespace(agent_id="agent-1") + + +class _Client: + 
auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def build_properties(agent, allow_missing=None): + session = AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="channel", + agent_uid="1", + remote_uids=["100"], + ) + return session._build_start_properties( # noqa: SLF001 + {"app_id": APP_ID, "app_certificate": APP_CERTIFICATE}, + skip_vendor_validation_categories=set(), + allow_missing_vendor_categories=allow_missing or set(), + ) + + +# =========================================================================== +# Scenario 1 — BYOK pipeline (full properties shape) +# =========================================================================== + + +def test_byok_pipeline_full_properties_shape() -> None: + """OpenAI BYOK LLM + Deepgram BYOK STT + ElevenLabs TTS produces expected properties.""" + agent = ( + Agent(name="support") + .with_stt(DeepgramSTT(api_key="dg-key", model="nova-2", language="en")) + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="voice123", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + ) + + props = build_properties(agent) + + # RTC routing + assert props["channel"] == "channel" + assert props["agent_rtc_uid"] == "1" + assert props["remote_rtc_uids"] == ["100"] + + # ASR + asr = props["asr"] + assert asr["vendor"] == "deepgram" + assert asr["params"]["key"] == "dg-key" + assert asr["params"]["model"] == "nova-2" + assert asr["params"]["language"] == "en" + + # LLM + llm = props["llm"] + assert llm["api_key"] == "openai-key" + assert llm["style"] == "openai" + assert llm["params"]["model"] == "gpt-4o" + + # TTS + tts = props["tts"] + assert tts["vendor"] == "elevenlabs" + assert 
tts["params"]["key"] == "el-key" + assert tts["params"]["model_id"] == "eleven_flash_v2_5" + assert tts["params"]["voice_id"] == "voice123" + + +# =========================================================================== +# Scenario 2 — Preset-backed pipeline (full start request, field stripping) +# =========================================================================== + + +def test_managed_llm_and_tts_produce_preset_and_strip_fields() -> None: + """Managed OpenAI LLM + MiniMax TTS generate preset string and strip BYOK fields.""" + agent = ( + Agent(name="support") + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_8_turbo", voice_id="English_captivating_female1")) + ) + + call = start_session(agent) + assert "openai_gpt_4o_mini" in (call["preset"] or "") + assert "minimax_speech_2_8_turbo" in (call["preset"] or "") + + properties = dump(call["properties"]) + # api_key and url stripped for managed LLM + assert "api_key" not in properties.get("llm", {}) + # vendor retained for TTS + assert properties["tts"]["vendor"] == "minimax" + # BYOK key stripped for managed TTS + assert "key" not in properties["tts"].get("params", {}) + + +# =========================================================================== +# Scenario 3 — LLM config wins over agent-level fields +# =========================================================================== + + +def test_llm_config_greeting_wins_over_agent_level_greeting() -> None: + """When OpenAI vendor sets greeting_message it overrides agent.with_greeting().""" + agent = ( + Agent(name="support") + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + greeting_message="vendor greeting", + ) + ) + .with_greeting("agent greeting") + ) + + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["greeting_message"] == "vendor greeting" + + +# 
=========================================================================== +# Scenario 4 — VertexAILLM URL construction +# =========================================================================== + + +def test_vertex_ai_llm_constructs_correct_url_and_params() -> None: + """VertexAILLM auto-constructs the aiplatform URL and injects project_id/location into params.""" + agent = Agent(name="support").with_llm( + VertexAILLM( + api_key="vertex-token", + model="gemini-2.0-flash", + project_id="my-project", + location="us-central1", + ) + ) + + props = build_properties(agent, allow_missing={"asr", "tts"}) + llm = props["llm"] + + expected_url_fragment = "us-central1-aiplatform.googleapis.com" + assert expected_url_fragment in llm["url"] + assert "my-project" in llm["url"] + assert llm["style"] == "gemini" + assert llm["params"]["project_id"] == "my-project" + assert llm["params"]["location"] == "us-central1" + assert llm["params"]["model"] == "gemini-2.0-flash" + + +# =========================================================================== +# Scenario 5 — OpenAISTT params +# =========================================================================== + + +def test_openai_stt_5a_model_param_is_sent() -> None: + """5a: OpenAISTT model appears inside input_audio_transcription.model.""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="transcribe clearly", + language="en", + ) + ) + + props = build_properties(agent, allow_missing={"llm", "tts"}) + transcription = props["asr"]["params"]["input_audio_transcription"] + assert transcription["model"] == "gpt-4o-mini-transcribe" + + +def test_openai_stt_5b_prompt_param_is_sent() -> None: + """5b: OpenAISTT prompt appears inside input_audio_transcription.prompt.""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="use proper nouns", + language="en", + ) + ) + + props = 
build_properties(agent, allow_missing={"llm", "tts"}) + transcription = props["asr"]["params"]["input_audio_transcription"] + assert transcription["prompt"] == "use proper nouns" + + +def test_openai_stt_5c_language_param_is_sent() -> None: + """5c: OpenAISTT language appears inside input_audio_transcription.language.""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="some prompt", + language="fr", + ) + ) + + props = build_properties(agent, allow_missing={"llm", "tts"}) + transcription = props["asr"]["params"]["input_audio_transcription"] + assert transcription["language"] == "fr" + + +def test_openai_stt_5d_api_key_is_top_level_in_params() -> None: + """5d: OpenAISTT api_key is a top-level key inside asr.params (not inside input_audio_transcription).""" + agent = Agent(name="support").with_stt( + OpenAISTT( + api_key="oai-key", + model="gpt-4o-mini-transcribe", + prompt="some prompt", + language="en", + ) + ) + + props = build_properties(agent, allow_missing={"llm", "tts"}) + asr_params = props["asr"]["params"] + assert asr_params["api_key"] == "oai-key" + assert "api_key" not in asr_params.get("input_audio_transcription", {}) + + +# =========================================================================== +# Scenario 6 — Mixed preset + BYOK +# =========================================================================== + + +def test_6a_asr_preset_with_byok_llm_and_tts() -> None: + """6a: Managed Deepgram ASR preset + BYOK LLM + BYOK TTS.""" + agent = ( + Agent(name="support") + .with_stt(DeepgramSTT(model="nova-3", language="en-US")) + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="voice123", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + ) + + call = start_session(agent) + preset = call.get("preset") or "" + assert 
"deepgram_nova_3" in preset + # No LLM or TTS preset inferred + assert "openai_gpt" not in preset + assert "openai_tts" not in preset + + properties = dump(call["properties"]) + assert properties["llm"]["api_key"] == "openai-key" + assert properties["tts"]["vendor"] == "elevenlabs" + + +def test_6b_tts_preset_with_byok_llm_and_asr() -> None: + """6b: Managed OpenAITTS preset + BYOK LLM + BYOK Deepgram ASR.""" + agent = ( + Agent(name="support") + .with_stt(DeepgramSTT(api_key="dg-key", model="nova-2", language="en-US")) + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts(OpenAITTS(voice="alloy")) + ) + + call = start_session(agent) + preset = call.get("preset") or "" + assert "openai_tts_1" in preset + assert "deepgram_nova_2" not in preset # BYOK key present — no ASR preset inferred + + properties = dump(call["properties"]) + # BYOK ASR: key and model both retained (nothing stripped for BYOK path) + assert properties["asr"]["params"]["key"] == "dg-key" + assert properties["asr"]["params"]["model"] == "nova-2" + # BYOK LLM key retained + assert properties["llm"]["api_key"] == "openai-key" + # TTS api_key stripped (managed) + assert "api_key" not in properties["tts"].get("params", {}) + + +# =========================================================================== +# Scenario 7 — Pipeline ID +# =========================================================================== + + +def test_7b_pipeline_id_with_byok_llm_override() -> None: + """7b: pipeline_id present, single LLM override, ASR/TTS absent from properties.""" + agent = Agent(name="support", pipeline_id="studio-pipeline").with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert properties["llm"]["api_key"] == "openai-key" + 
assert "asr" not in properties + assert "tts" not in properties + + +def test_7c_pipeline_id_empty_properties_no_vendors() -> None: + """7c: pipeline_id alone — no vendor keys in properties.""" + agent = Agent(name="support", pipeline_id="studio-pipeline") + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert "asr" not in properties + assert "llm" not in properties + assert "tts" not in properties + + +# =========================================================================== +# Scenario 8 — MLLM mode +# =========================================================================== + + +def test_8a_mllm_start_call_has_correct_top_level_vendor() -> None: + """8a: OpenAIRealtime MLLM session – start call contains mllm with vendor=openai.""" + agent = Agent(name="support").with_mllm( + OpenAIRealtime(api_key="realtime-key", model="gpt-4o-realtime-preview", voice="coral") + ) + + call = start_session(agent) + properties = dump(call["properties"]) + assert "mllm" in properties + mllm = properties["mllm"] + assert mllm["vendor"] == "openai" + assert mllm["api_key"] == "realtime-key" + assert mllm["params"]["model"] == "gpt-4o-realtime-preview" + assert mllm["params"]["voice"] == "coral" + + +def test_8b_agent_greeting_fills_mllm_when_vendor_omits_it() -> None: + """8b: agent.with_greeting() fills mllm.greeting_message when vendor does not set it.""" + agent = ( + Agent(name="support") + .with_mllm(OpenAIRealtime(api_key="realtime-key")) + .with_greeting("hello from agent") + ) + + props = build_properties(agent) + assert props["mllm"]["greeting_message"] == "hello from agent" + + +def test_8c_vendor_greeting_wins_over_agent_level_greeting_in_mllm() -> None: + """8c: Vendor-level greeting_message wins over agent.with_greeting() in MLLM mode.""" + agent = ( + Agent(name="support") + .with_mllm( + OpenAIRealtime( + api_key="realtime-key", + greeting_message="vendor greeting", + ) + ) + 
.with_greeting("agent greeting") + ) + + props = build_properties(agent) + assert props["mllm"]["greeting_message"] == "vendor greeting" + + +# =========================================================================== +# BYOK Vendor Coverage Matrix — STT vendors +# =========================================================================== + + +def test_byok_deepgram_stt_params() -> None: + agent = Agent(name="t").with_stt( + DeepgramSTT(api_key="dg-key", model="nova-2", language="en") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "deepgram" + assert props["asr"]["params"]["key"] == "dg-key" + assert props["asr"]["params"]["model"] == "nova-2" + assert props["asr"]["params"]["language"] == "en" + + +def test_byok_microsoft_stt_params() -> None: + agent = Agent(name="t").with_stt( + MicrosoftSTT(key="ms-key", region="eastus", language="en-US") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "microsoft" + assert props["asr"]["params"]["key"] == "ms-key" + assert props["asr"]["params"]["region"] == "eastus" + assert props["asr"]["params"]["language"] == "en-US" + + +def test_byok_google_stt_params() -> None: + agent = Agent(name="t").with_stt( + GoogleSTT( + project_id="my-project", + location="global", + adc_credentials_string="{}", + language="en-US", + model="long", + ) + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "google" + p = props["asr"]["params"] + assert p["project_id"] == "my-project" + assert p["location"] == "global" + assert p["language"] == "en-US" + assert p["model"] == "long" + + +def test_byok_amazon_stt_params() -> None: + agent = Agent(name="t").with_stt( + AmazonSTT(access_key="ak", secret_key="sk", region="us-east-1", language="en-US") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "amazon" + p = props["asr"]["params"] + 
assert p["access_key_id"] == "ak" + assert p["secret_access_key"] == "sk" + assert p["region"] == "us-east-1" + assert p["language_code"] == "en-US" + + +def test_byok_assemblyai_stt_params() -> None: + agent = Agent(name="t").with_stt( + AssemblyAISTT(api_key="assembly-key", language="en-US") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "assemblyai" + assert props["asr"]["params"]["api_key"] == "assembly-key" + assert props["asr"]["params"]["language"] == "en-US" + + +def test_byok_ares_stt_no_params() -> None: + agent = Agent(name="t").with_stt(AresSTT()) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "ares" + assert "params" not in props["asr"] + + +def test_byok_speechmatics_stt_params() -> None: + agent = Agent(name="t").with_stt( + SpeechmaticsSTT(api_key="sm-key", language="en") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "speechmatics" + assert props["asr"]["params"]["api_key"] == "sm-key" + assert props["asr"]["params"]["language"] == "en" + + +def test_byok_sarvam_stt_params() -> None: + agent = Agent(name="t").with_stt( + SarvamSTT(api_key="sarvam-key", language="en-IN") + ) + props = build_properties(agent, allow_missing={"llm", "tts"}) + assert props["asr"]["vendor"] == "sarvam" + assert props["asr"]["params"]["api_key"] == "sarvam-key" + assert props["asr"]["params"]["language"] == "en-IN" + + +# --------------------------------------------------------------------------- +# BYOK Vendor Coverage Matrix — LLM vendors +# --------------------------------------------------------------------------- + + +def test_byok_openai_llm_params() -> None: + agent = Agent(name="t").with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] 
== "openai-key" + assert props["llm"]["style"] == "openai" + assert props["llm"]["params"]["model"] == "gpt-4o" + + +def test_byok_azure_openai_llm_params() -> None: + agent = Agent(name="t").with_llm( + AzureOpenAI( + api_key="azure-key", + endpoint="https://example.openai.azure.com", + deployment_name="my-deployment", + model="gpt-4o", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "azure-key" + assert props["llm"]["style"] == "openai" + assert props["llm"]["params"]["model"] == "gpt-4o" + + +def test_byok_anthropic_llm_params() -> None: + agent = Agent(name="t").with_llm( + Anthropic( + api_key="anthropic-key", + model="claude-3-5-sonnet-20241022", + url="https://api.anthropic.com/v1/messages", + headers={"anthropic-version": "2023-06-01"}, + max_tokens=1024, + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "anthropic-key" + assert props["llm"]["style"] == "anthropic" + assert props["llm"]["headers"]["anthropic-version"] == "2023-06-01" + assert props["llm"]["params"]["max_tokens"] == 1024 + + +def test_byok_gemini_llm_params() -> None: + agent = Agent(name="t").with_llm( + Gemini(api_key="gemini-key", model="gemini-2.0-flash") + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "gemini-key" + assert props["llm"]["style"] == "gemini" + assert props["llm"]["params"]["model"] == "gemini-2.0-flash" + + +def test_byok_groq_llm_params() -> None: + agent = Agent(name="t").with_llm( + Groq( + api_key="groq-key", + model="llama-3.3-70b-versatile", + base_url="https://api.groq.com/openai/v1/chat/completions", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "groq-key" + assert props["llm"]["style"] == "openai" + assert props["llm"]["params"]["model"] == "llama-3.3-70b-versatile" + + +def test_byok_custom_llm_params() -> None: + agent = 
Agent(name="t").with_llm( + CustomLLM( + api_key="custom-key", + model="my-model", + base_url="https://llm.example.com/chat", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "custom-key" + assert props["llm"]["vendor"] == "custom" + assert props["llm"]["style"] == "openai" + + +def test_byok_amazon_bedrock_llm_params() -> None: + agent = Agent(name="t").with_llm( + AmazonBedrock( + access_key="aws-access", + secret_key="aws-secret", + region="us-east-1", + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["style"] == "bedrock" + assert props["llm"]["access_key"] == "aws-access" + assert "us-east-1" in props["llm"]["url"] + + +def test_byok_dify_llm_params() -> None: + agent = Agent(name="t").with_llm( + Dify( + api_key="dify-key", + url="https://api.dify.ai/v1/chat-messages", + model="default", + ) + ) + props = build_properties(agent, allow_missing={"asr", "tts"}) + assert props["llm"]["api_key"] == "dify-key" + assert props["llm"]["style"] == "dify" + assert props["llm"]["params"]["model"] == "default" + + +# --------------------------------------------------------------------------- +# BYOK Vendor Coverage Matrix — TTS vendors +# --------------------------------------------------------------------------- + + +def test_byok_elevenlabs_tts_params() -> None: + agent = Agent(name="t").with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="voice", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "elevenlabs" + assert props["tts"]["params"]["key"] == "el-key" + assert props["tts"]["params"]["model_id"] == "eleven_flash_v2_5" + assert props["tts"]["params"]["voice_id"] == "voice" + + +def test_byok_microsoft_tts_params() -> None: + agent = Agent(name="t").with_tts( + 
MicrosoftTTS(key="ms-key", region="eastus", voice_name="en-US-JennyNeural") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "microsoft" + assert props["tts"]["params"]["key"] == "ms-key" + assert props["tts"]["params"]["region"] == "eastus" + assert props["tts"]["params"]["voice_name"] == "en-US-JennyNeural" + + +def test_byok_openai_tts_params() -> None: + agent = Agent(name="t").with_tts( + OpenAITTS( + api_key="oai-tts-key", + voice="alloy", + model="tts-1-hd", + base_url="https://api.openai.com/v1", + ) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "openai" + assert props["tts"]["params"]["api_key"] == "oai-tts-key" + assert props["tts"]["params"]["model"] == "tts-1-hd" + assert props["tts"]["params"]["voice"] == "alloy" + + +def test_byok_cartesia_tts_params() -> None: + agent = Agent(name="t").with_tts( + CartesiaTTS(api_key="cartesia-key", voice_id="voice", model_id="sonic-2", sample_rate=24000) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "cartesia" + p = props["tts"]["params"] + assert p["api_key"] == "cartesia-key" + assert p["voice"] == {"mode": "id", "id": "voice"} + + +def test_byok_google_tts_params() -> None: + config = GoogleTTS(key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000).to_config() + assert config["vendor"] == "google" + p = config["params"] + assert p["credentials"] == "{}" + assert p["voice_selection_params"]["name"] == "en-US-JennyNeural" + assert p["voice_selection_params"]["language_code"] == "en-US" + + +def test_byok_amazon_tts_params() -> None: + agent = Agent(name="t").with_tts( + AmazonTTS(access_key="access", secret_key="secret", region="us-east-1", voice_id="Joanna", engine="neural") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "amazon" + p = props["tts"]["params"] + 
assert p["aws_access_key_id"] == "access" + assert p["aws_secret_access_key"] == "secret" + assert p["voice"] == "Joanna" + + +def test_byok_deepgram_tts_params() -> None: + agent = Agent(name="t").with_tts( + DeepgramTTS(api_key="dg-tts-key", model="aura-2-thalia-en", base_url="wss://api.deepgram.com/v1/speak", sample_rate=24000) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "deepgram" + assert props["tts"]["params"]["api_key"] == "dg-tts-key" + assert props["tts"]["params"]["model"] == "aura-2-thalia-en" + + +def test_byok_humeai_tts_params() -> None: + agent = Agent(name="t").with_tts( + HumeAITTS(key="hume-key", voice_id="voice", provider="CUSTOM_VOICE") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "humeai" + assert props["tts"]["params"]["key"] == "hume-key" + assert props["tts"]["params"]["voice_id"] == "voice" + + +def test_byok_rime_tts_params() -> None: + config = RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config() + assert config["vendor"] == "rime" + assert config["params"]["api_key"] == "rime-key" + assert config["params"]["speaker"] == "speaker" + assert config["params"]["model_id"] == "mist" + + +def test_byok_fishaudio_tts_params() -> None: + agent = Agent(name="t").with_tts( + FishAudioTTS(key="fish-key", reference_id="ref", backend="speech-1.5") + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "fishaudio" + assert props["tts"]["params"]["api_key"] == "fish-key" + assert props["tts"]["params"]["reference_id"] == "ref" + + +def test_byok_minimax_byok_tts_params() -> None: + agent = Agent(name="t").with_tts( + MiniMaxTTS( + key="mm-key", + group_id="group", + model="speech-02-turbo", + voice_id="voice", + url="wss://api-uw.minimax.io/ws/v1/t2a_v2", + ) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "minimax" + 
assert props["tts"]["params"]["key"] == "mm-key" + + +def test_byok_sarvam_tts_params() -> None: + agent = Agent(name="t").with_tts( + SarvamTTS(key="sarvam-key", speaker="anushka", target_language_code="en-IN", sample_rate=24000) + ) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "sarvam" + assert props["tts"]["params"]["api_subscription_key"] == "sarvam-key" + assert props["tts"]["params"]["speaker"] == "anushka" + + +def test_byok_murf_tts_params() -> None: + agent = Agent(name="t").with_tts(MurfTTS(key="murf-key")) + props = build_properties(agent, allow_missing={"asr", "llm"}) + assert props["tts"]["vendor"] == "murf" + assert props["tts"]["params"]["api_key"] == "murf-key" + + +# --------------------------------------------------------------------------- +# BYOK Vendor Coverage Matrix — MLLM vendors +# --------------------------------------------------------------------------- + + +def test_byok_openai_realtime_mllm_params() -> None: + agent = Agent(name="t").with_mllm( + OpenAIRealtime(api_key="realtime-key", model="gpt-4o-realtime-preview", voice="coral") + ) + props = build_properties(agent) + assert props["mllm"]["vendor"] == "openai" + assert props["mllm"]["api_key"] == "realtime-key" + assert props["mllm"]["params"]["model"] == "gpt-4o-realtime-preview" + assert props["mllm"]["params"]["voice"] == "coral" + + +def test_byok_gemini_live_mllm_params() -> None: + agent = Agent(name="t").with_mllm( + GeminiLive(api_key="gemini-key", model="gemini-live-2.5-flash") + ) + props = build_properties(agent) + assert props["mllm"]["vendor"] == "gemini" + assert props["mllm"]["api_key"] == "gemini-key" + assert props["mllm"]["params"]["model"] == "gemini-live-2.5-flash" + + +def test_byok_vertex_ai_mllm_params() -> None: + agent = Agent(name="t").with_mllm( + VertexAI( + project_id="my-project", + location="us-central1", + adc_credentials_string="{}", + model="gemini-live-2.5-flash", + ) + ) + props = 
build_properties(agent) + assert props["mllm"]["vendor"] == "vertexai" + assert props["mllm"]["project_id"] == "my-project" + assert props["mllm"]["location"] == "us-central1" + assert props["mllm"]["adc_credentials_string"] == "{}" + assert props["mllm"]["params"]["model"] == "gemini-live-2.5-flash" + + +def test_byok_xai_grok_mllm_params() -> None: + agent = Agent(name="t").with_mllm(XaiGrok(api_key="xai-key")) + props = build_properties(agent) + assert props["mllm"]["vendor"] == "xai" + assert props["mllm"]["api_key"] == "xai-key" + + +# =========================================================================== +# Preset Coverage Matrix +# =========================================================================== + + +def test_preset_deepgram_nova_2_inferred() -> None: + tts = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice") + preset, properties = resolve_session_presets(None, {"asr": DeepgramSTT(model="nova-2", language="en").to_config(), "tts": tts.to_config()}) + assert preset is not None and "deepgram_nova_2" in preset + + +def test_preset_deepgram_nova_3_inferred() -> None: + tts = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice") + preset, properties = resolve_session_presets(None, {"asr": DeepgramSTT(model="nova-3", language="en").to_config(), "tts": tts.to_config()}) + assert preset is not None and "deepgram_nova_3" in preset + + +def test_preset_openai_gpt_4o_mini_inferred() -> None: + tts = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice") + preset, properties = resolve_session_presets(None, {"llm": OpenAI(model="gpt-4o-mini").to_config(), "tts": tts.to_config()}) + assert preset is not None and "openai_gpt_4o_mini" in preset + + +def test_preset_openai_tts_1_inferred() -> None: + preset, properties = resolve_session_presets(None, {"tts": OpenAITTS(voice="alloy").to_config()}) + assert preset == "openai_tts_1" + assert properties["tts"]["vendor"] == "openai" + + +def test_preset_minimax_speech_2_8_turbo_inferred() -> None: + preset, 
properties = resolve_session_presets(None, {"tts": MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice").to_config()}) + assert preset == "minimax_speech_2_8_turbo" + + +def test_preset_minimax_speech_2_6_turbo_inferred() -> None: + preset, properties = resolve_session_presets(None, {"tts": MiniMaxTTS(model="speech-2.6-turbo", voice_id="voice").to_config()}) + assert preset == "minimax_speech_2_6_turbo" diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index bdd9482..7d2739b 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -14,8 +14,8 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: assert GoogleTTS(key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000).to_config()["params"] == { "credentials": "{}", - "VoiceSelectionParams": {"name": "en-US-JennyNeural", "language_code": "en-US"}, - "AudioConfig": {"sample_rate_hertz": 24000}, + "voice_selection_params": {"name": "en-US-JennyNeural", "language_code": "en-US"}, + "audio_config": {"sample_rate_hertz": 24000}, } assert CartesiaTTS(api_key="cartesia-key", voice_id="voice", model_id="sonic-2", sample_rate=24000).to_config()["params"] == { @@ -28,7 +28,7 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: assert RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config()["params"] == { "api_key": "rime-key", "speaker": "speaker", - "modelId": "mist", + "model_id": "mist", } assert FishAudioTTS(key="fish-key", reference_id="ref", backend="speech-1.5").to_config()["params"] == { From 87585c5ce6a49bab54b9b984b6785e5f8b29119b Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 11:21:31 -0400 Subject: [PATCH 09/14] aligned vendor structs with expected params.keys names --- src/agora_agent/agentkit/presets.py | 2 + src/agora_agent/agentkit/vendors/llm.py | 7 +-- src/agora_agent/agentkit/vendors/tts.py | 8 ++-- tests/custom/test_request_body.py | 58 
++++++++++++++++++++++++- tests/custom/test_tts_vendors.py | 41 +++++++++++++++++ 5 files changed, 105 insertions(+), 11 deletions(-) diff --git a/src/agora_agent/agentkit/presets.py b/src/agora_agent/agentkit/presets.py index 4d3409d..f160cee 100644 --- a/src/agora_agent/agentkit/presets.py +++ b/src/agora_agent/agentkit/presets.py @@ -187,6 +187,8 @@ def strip_inferred_preset_fields(properties: typing.Dict[str, typing.Any], infer params["url"] = None tts = {k: v for k, v in {**tts, "params": _omit_none(params)}.items() if v is not None} tts.pop("_minimax_preset_model", None) + if tts and "_minimax_preset_model" in tts: + tts = {k: v for k, v in tts.items() if k != "_minimax_preset_model"} return {**properties, "asr": asr, "llm": llm, "tts": tts} diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 443b5cd..5a9f39e 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -382,12 +382,7 @@ def to_config(self) -> Dict[str, Any]: f"{self.options.project_id}/locations/{self.options.location}/" f"publishers/google/models/{self.options.model}:streamGenerateContent?alt=sse" ) - config = Gemini(**options).to_config() - params = dict(config.get("params") or {}) - params["project_id"] = self.options.project_id - params["location"] = self.options.location - config["params"] = params - return config + return Gemini(**options).to_config() class AmazonBedrockOptions(BaseModel): diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index 0a7b699..acfec78 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -214,13 +214,13 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "credentials": self.options.key, - "voice_selection_params": {"name": self.options.voice_name}, + "VoiceSelectionParams": {"name": self.options.voice_name}, } if 
self.options.language_code is not None: - params["voice_selection_params"]["language_code"] = self.options.language_code + params["VoiceSelectionParams"]["language_code"] = self.options.language_code if self.options.sample_rate_hertz is not None: - params["audio_config"] = {"sample_rate_hertz": self.options.sample_rate_hertz} + params["AudioConfig"] = {"sample_rate_hertz": self.options.sample_rate_hertz} result: Dict[str, Any] = {"vendor": "google", "params": params} if self.options.skip_patterns is not None: @@ -359,7 +359,7 @@ def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.key, "speaker": self.options.speaker, - "model_id": self.options.model_id, + "modelId": self.options.model_id, } if self.options.base_url is not None: params["base_url"] = self.options.base_url diff --git a/tests/custom/test_request_body.py b/tests/custom/test_request_body.py index bf985f0..f4455f4 100644 --- a/tests/custom/test_request_body.py +++ b/tests/custom/test_request_body.py @@ -464,6 +464,49 @@ def test_7c_pipeline_id_empty_properties_no_vendors() -> None: assert "tts" not in properties +def test_7d_pipeline_id_with_byok_tts_only() -> None: + """7d: pipeline_id present, TTS-only BYOK override — ASR and LLM absent from properties.""" + agent = Agent(name="support", pipeline_id="studio-pipeline").with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="some-voice", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert "asr" not in properties + assert "llm" not in properties + assert properties["tts"]["vendor"] == "elevenlabs" + assert properties["tts"]["params"]["key"] == "el-key" + + +def test_7e_pipeline_id_with_byok_asr_and_tts() -> None: + """7e: pipeline_id present, ASR+TTS BYOK overrides — LLM absent from properties.""" + agent = ( + Agent(name="support", pipeline_id="studio-pipeline") 
+ .with_stt(DeepgramSTT(api_key="dg-key", language="en")) + .with_tts( + ElevenLabsTTS( + key="el-key", + model_id="eleven_flash_v2_5", + voice_id="some-voice", + base_url="wss://api.elevenlabs.io/v1", + ) + ) + ) + + call = start_session(agent) + assert call["pipeline_id"] == "studio-pipeline" + properties = dump(call["properties"]) + assert "llm" not in properties + assert properties["asr"]["vendor"] == "deepgram" + assert properties["tts"]["vendor"] == "elevenlabs" + + # =========================================================================== # Scenario 8 — MLLM mode # =========================================================================== @@ -872,10 +915,11 @@ def test_byok_sarvam_tts_params() -> None: def test_byok_murf_tts_params() -> None: - agent = Agent(name="t").with_tts(MurfTTS(key="murf-key")) + agent = Agent(name="t").with_tts(MurfTTS(key="murf-key", voice_id="Ariana")) props = build_properties(agent, allow_missing={"asr", "llm"}) assert props["tts"]["vendor"] == "murf" assert props["tts"]["params"]["api_key"] == "murf-key" + assert props["tts"]["params"]["voiceId"] == "Ariana" # --------------------------------------------------------------------------- @@ -965,3 +1009,15 @@ def test_preset_minimax_speech_2_8_turbo_inferred() -> None: def test_preset_minimax_speech_2_6_turbo_inferred() -> None: preset, properties = resolve_session_presets(None, {"tts": MiniMaxTTS(model="speech-2.6-turbo", voice_id="voice").to_config()}) assert preset == "minimax_speech_2_6_turbo" + + +def test_explicit_minimax_preset_strips_internal_hint() -> None: + """Explicit MiniMax TTS preset must not leak _minimax_preset_model to the wire.""" + # When the caller supplies the preset explicitly, inference is skipped but the + # internal _minimax_preset_model hint set by MiniMaxTTS.to_config() must still + # be removed before the POST body is sent. 
+ tts_config = MiniMaxTTS(model="speech_2_8_turbo", voice_id="voice").to_config() + assert "_minimax_preset_model" in tts_config # confirm the hint is set pre-strip + + _, properties = resolve_session_presets("minimax_speech_2_8_turbo", {"tts": tts_config}) + assert "_minimax_preset_model" not in properties["tts"] diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index 7d2739b..b902bda 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -1,6 +1,7 @@ import pytest from agora_agent import AmazonTTS, CartesiaTTS, DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS +from agora_agent.agents.types.start_agents_request_properties import StartAgentsRequestProperties def test_tts_vendor_params_match_generated_core_shapes() -> None: @@ -116,3 +117,43 @@ def test_tts_managed_mode_validation_matches_core_shapes() -> None: with pytest.raises(Exception, match="MiniMaxTTS requires key unless using a supported Agora-managed model"): MiniMaxTTS(model="unsupported-model") + + +def test_tts_wire_serialization_applies_fern_aliases() -> None: + """Verify alias-sensitive TTS params reach the wire with the correct Fern aliases. + + The intermediate to_config() / build_properties() helpers return snake_case + field names. The real POST body goes through StartAgentsRequestProperties → + .dict(by_alias=True) → convert_and_respect_annotation_metadata(direction='write'), + which is what jsonable_encoder calls in the live HTTP client. These tests + exercise that full chain so a Fern alias regression would be caught. 
+ """ + _BASE = dict(channel="ch", token="tok", agent_rtc_uid="1", remote_rtc_uids=["100"]) + + # Google TTS: voice_selection_params and audio_config must arrive as PascalCase aliases + google_config = GoogleTTS( + key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000 + ).to_config() + assert "voice_selection_params" in google_config["params"] # pre-condition: to_config emits snake_case + google_wire = StartAgentsRequestProperties(**_BASE, tts=google_config).dict(by_alias=True) + google_params = google_wire["tts"]["params"] + assert "VoiceSelectionParams" in google_params, f"wire missing VoiceSelectionParams, got: {list(google_params)}" + assert "voice_selection_params" not in google_params + assert "AudioConfig" in google_params + assert "audio_config" not in google_params + + # Rime TTS: model_id must arrive as modelId alias + rime_config = RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config() + assert "model_id" in rime_config["params"] # pre-condition: to_config emits snake_case + rime_wire = StartAgentsRequestProperties(**_BASE, tts=rime_config).dict(by_alias=True) + rime_params = rime_wire["tts"]["params"] + assert "modelId" in rime_params, f"wire missing modelId, got: {list(rime_params)}" + assert "model_id" not in rime_params + + # Murf TTS: voiceId (emitted by to_config as alias) must survive through wire serialization + murf_config = MurfTTS(key="murf-key", voice_id="Ariana").to_config() + assert "voiceId" in murf_config["params"] # to_config currently emits alias directly + murf_wire = StartAgentsRequestProperties(**_BASE, tts=murf_config).dict(by_alias=True) + murf_params = murf_wire["tts"]["params"] + assert "voiceId" in murf_params, f"wire missing voiceId, got: {list(murf_params)}" + assert murf_params["voiceId"] == "Ariana" From fecdc77c866f433d8287fcb8a55328612e016b21 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 13:17:42 -0400 Subject: [PATCH 10/14] Fix AgentKit request 
validation and provider wire-key coverage --- PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md | 27 +++ src/agora_agent/agentkit/agent.py | 14 +- src/agora_agent/agentkit/agent_session.py | 59 +++++-- tests/custom/test_docs_snake_case.py | 94 +++++++++++ tests/custom/test_llm_vendors.py | 6 +- tests/custom/test_request_body.py | 196 +++++++++++++++++++++- tests/custom/test_stt_language.py | 26 +++ tests/custom/test_tts_vendors.py | 40 +++-- 8 files changed, 420 insertions(+), 42 deletions(-) create mode 100644 PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md create mode 100644 tests/custom/test_docs_snake_case.py diff --git a/PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md b/PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md new file mode 100644 index 0000000..f3cd64a --- /dev/null +++ b/PYTHON-AGENTKIT-SNAKE-CASE-AUDIT.md @@ -0,0 +1,27 @@ +# Python AgentKit Snake Case API Audit + +Scope: `agora-agents-python` public AgentKit wrappers, docs, and tests. + +Search terms: + +```bash +rg -n "apiKey|baseUrl|modelId|voiceId|groupId|keyTerm|turnDetection|inputAudioTranscription|greetingMessage|failureMessage|projectId|adcCredentialsString|sampleRate|targetLanguageCode|resourceName|deploymentName" agora-agents-python +``` + +## Result + +No shipped camelCase public Python constructor kwargs were found in source or docs examples. No deprecated alias helper is required for this pass. 
+ +| File | Class / symbol | Public arg or example | Current spelling | Desired Python spelling | `to_config()` key | Wire key | Action | Compatibility needed | Test coverage | +|---|---|---|---|---|---|---|---|---|---| +| `src/agora_agent/agentkit/vendors/tts.py` | `GoogleTTS` | constructor arg | `voice_name` | `voice_name` | `params.VoiceSelectionParams` | `params.VoiceSelectionParams` | keep | no | `tests/custom/test_tts_vendors.py` | +| `src/agora_agent/agentkit/vendors/tts.py` | `RimeTTS` | constructor arg | `model_id` | `model_id` | `params.modelId` | `params.modelId` | keep | no | `tests/custom/test_tts_vendors.py` | +| `src/agora_agent/agentkit/vendors/tts.py` | `MurfTTS` | constructor arg | `voice_id` | `voice_id` | `params.voiceId` | `params.voiceId` | keep | no | `tests/custom/test_tts_vendors.py`, `tests/custom/test_request_body.py` | +| `src/agora_agent/types/rime_tts_params.py` | generated model | generated alias | `modelId` | n/a | `model_id` | `modelId` | keep | no | `tests/custom/test_tts_vendors.py` | +| `src/agora_agent/types/murf_tts_params.py` | generated model | generated alias | `voiceId` | n/a | `voice_id` | `voiceId` | keep | no | `tests/custom/test_tts_vendors.py` | +| `tests/custom/test_request_body.py` | wire assertion | payload key | `voiceId` | n/a | `params.voiceId` | `params.voiceId` | keep | no | request-body test | +| `tests/custom/test_tts_vendors.py` | wire assertion | payload key | `modelId`, `voiceId`, `VoiceSelectionParams` | n/a | generated model fields | wire aliases | keep | no | wire serialization test | + +## Guardrail Added + +`tests/custom/test_docs_snake_case.py` scans Python markdown code fences and fails on common camelCase kwargs such as `apiKey`, `baseUrl`, `modelId`, `voiceId`, `projectId`, and `greetingMessage`. JSON, TypeScript, Go, shell, and YAML examples are skipped so wire payload examples can retain required non-Python keys. 
diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 7447d0b..992945c 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -76,6 +76,7 @@ from ..agent_management.types.agent_think_agent_management_response import ( AgentThinkAgentManagementResponse, ) +from ..core.pydantic_utilities import parse_obj_as from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS # Top-level aliases @@ -188,6 +189,13 @@ class SessionOptions(typing_extensions.TypedDict, total=False): debug: bool warn: typing.Callable[[str], None] + +def _start_properties_from_mapping( + properties: typing.Mapping[str, typing.Any], +) -> StartAgentsRequestProperties: + return parse_obj_as(StartAgentsRequestProperties, dict(properties)) + + # LLM sub-type aliases LlmGreetingConfigs = typing.Dict[str, typing.Any] LlmGreetingConfigsMode = typing.Any @@ -896,7 +904,7 @@ def to_properties( if self._failure_message is not None: mllm_config.setdefault("failure_message", self._failure_message) base_kwargs["mllm"] = mllm_config - return StartAgentsRequestProperties(**base_kwargs) + return _start_properties_from_mapping(base_kwargs) if skip_vendor_validation: warnings.warn( @@ -925,7 +933,7 @@ def to_properties( base_kwargs["turn_detection"] = turn_detection_config if skip_vendor_validation: - return StartAgentsRequestProperties(**base_kwargs) + return _start_properties_from_mapping(base_kwargs) if self._tts is None and not (skip_tts_validation or allow_missing_tts): raise ValueError("TTS configuration is required. 
Use with_tts() to set it.") @@ -938,7 +946,7 @@ def to_properties( if self._tts is not None and not skip_tts_validation: base_kwargs["tts"] = self._tts - return StartAgentsRequestProperties(**base_kwargs) + return _start_properties_from_mapping(base_kwargs) def _resolve_llm_config(self) -> typing.Dict[str, typing.Any]: llm_config = dict(self._llm or {}) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index 7c23232..8eb9810 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -15,8 +15,7 @@ AgentThinkAgentManagementResponse as AgentThinkResponse, ) from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse -from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties -from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions +from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions, _start_properties_from_mapping from .avatar_types import ( is_akool_avatar, is_anam_avatar, @@ -350,6 +349,44 @@ def _build_start_properties( return properties + @staticmethod + def _request_properties_for_start( + resolved_properties: typing.Dict[str, typing.Any], + *, + resolved_preset: typing.Optional[str], + pipeline_id: typing.Optional[str], + ) -> typing.Any: + try: + return _start_properties_from_mapping(resolved_properties) + except Exception as exc: + if pipeline_id: + return resolved_properties + if resolved_preset: + preset_categories = { + category + for item in normalize_preset_input(resolved_preset).split(",") + for category in [get_preset_category(item)] + if category is not None + } + error_categories = _AgentSessionBase._validation_error_categories(exc) + if error_categories and error_categories.issubset(preset_categories): + return resolved_properties + raise + + @staticmethod + def _validation_error_categories(exc: Exception) -> typing.Set[str]: + errors = getattr(exc, "errors", None) + if not 
callable(errors): + return set() + categories: typing.Set[str] = set() + for error in errors(): + loc = error.get("loc") if isinstance(error, dict) else None + if isinstance(loc, tuple) and loc: + field = loc[0] + if field in {"asr", "llm", "tts"}: + categories.add(typing.cast(str, field)) + return categories + def _vendor_validation_categories( self, pipeline_id: typing.Optional[str], @@ -514,10 +551,11 @@ def start(self) -> str: "properties": resolved_properties, }) - try: - request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) - except Exception: - request_properties = resolved_properties + request_properties = self._request_properties_for_start( + resolved_properties, + resolved_preset=resolved_preset, + pipeline_id=pipeline_id, + ) response = self._client.agents.start( self._app_id, @@ -841,10 +879,11 @@ async def start(self) -> str: "properties": resolved_properties, }) - try: - request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) - except Exception: - request_properties = resolved_properties + request_properties = self._request_properties_for_start( + resolved_properties, + resolved_preset=resolved_preset, + pipeline_id=pipeline_id, + ) response = await self._client.agents.start( self._app_id, diff --git a/tests/custom/test_docs_snake_case.py b/tests/custom/test_docs_snake_case.py new file mode 100644 index 0000000..ee08043 --- /dev/null +++ b/tests/custom/test_docs_snake_case.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import re +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] + +SCANNED_MARKDOWN = [ + ROOT / "README.md", + *sorted((ROOT / "docs").rglob("*.md")), +] + +SKIP_LANGS = { + "bash", + "console", + "go", + "javascript", + "js", + "json", + "shell", + "sh", + "text", + "ts", + "typescript", + "yaml", + "yml", +} + +PYTHON_HINTS = ( + "from agora_agent", + "import agora_agent", + "Agent(", + "OpenAI(", + "OpenAITTS(", + "OpenAISTT(", + 
"MiniMaxTTS(", + "DeepgramSTT(", + "GoogleTTS(", + "RimeTTS(", + "VertexAI(", + "VertexAILLM(", +) + +BLOCKED_TERMS = { + "apiKey": "api_key", + "baseUrl": "base_url", + "modelId": "model_id", + "voiceId": "voice_id", + "groupId": "group_id", + "projectId": "project_id", + "resourceName": "resource_name", + "deploymentName": "deployment_name", + "inputAudioTranscription": "input_audio_transcription", + "greetingMessage": "greeting_message", + "failureMessage": "failure_message", + "turnDetection": "turn_detection", + "adcCredentialsString": "adc_credentials_string", + "sampleRate": "sample_rate", + "targetLanguageCode": "target_language_code", +} + +FENCE_RE = re.compile(r"^```(?P[^\n`]*)\n(?P.*?)(?:^```)", re.MULTILINE | re.DOTALL) + + +def _should_scan(lang: str, body: str) -> bool: + lang_parts = lang.strip().split(maxsplit=1) + normalized = lang_parts[0].lower() if lang_parts else "" + if normalized in {"python", "py"}: + return True + if normalized in SKIP_LANGS: + return False + if normalized: + return False + return any(hint in body for hint in PYTHON_HINTS) + + +def test_python_docs_examples_use_snake_case_kwargs() -> None: + failures: list[str] = [] + + for path in SCANNED_MARKDOWN: + text = path.read_text() + for match in FENCE_RE.finditer(text): + body = match.group("body") + if not _should_scan(match.group("lang"), body): + continue + + line_offset = text[: match.start("body")].count("\n") + for term, replacement in BLOCKED_TERMS.items(): + for term_match in re.finditer(rf"\b{re.escape(term)}\b", body): + line = line_offset + body[: term_match.start()].count("\n") + 1 + failures.append(f"{path.relative_to(ROOT)}:{line}: use {replacement} instead of {term}") + + assert not failures, "CamelCase kwargs found in Python docs examples:\n" + "\n".join(failures) diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py index 2861e45..0d07cf4 100644 --- a/tests/custom/test_llm_vendors.py +++ b/tests/custom/test_llm_vendors.py @@ -63,8 
+63,10 @@ def test_vertex_ai_llm_includes_project_routing() -> None: assert config["api_key"] == "vertex-token" assert config["style"] == "gemini" assert config["params"]["model"] == "gemini-2.0-flash" - assert config["params"]["project_id"] == "project" - assert config["params"]["location"] == "us-central1" + assert "project" in config["url"] + assert "us-central1" in config["url"] + assert "project_id" not in config.get("params", {}) + assert "location" not in config.get("params", {}) def test_amazon_bedrock_serializes_as_bedrock_style() -> None: diff --git a/tests/custom/test_request_body.py b/tests/custom/test_request_body.py index f4455f4..52c8875 100644 --- a/tests/custom/test_request_body.py +++ b/tests/custom/test_request_body.py @@ -78,6 +78,12 @@ def dump(value): return value +def dump_wire(value): + if hasattr(value, "dict"): + return value.dict(by_alias=True) + return dump(value) + + # --------------------------------------------------------------------------- # Pattern 1: FakeAgentsClient — captures the full start() call # --------------------------------------------------------------------------- @@ -96,6 +102,15 @@ def start(self, appid, **kwargs): return StartResponse() +class FakeAsyncAgentsClient: + def __init__(self): + self.calls = [] + + async def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + class FakeClient: app_id = "appid" app_certificate = None @@ -119,6 +134,51 @@ def start_session(agent, **session_kwargs): return agents.calls[0] +async def start_async_session(agent, **session_kwargs): + """Start async agent session via FakeAsyncAgentsClient and return the captured call dict.""" + agents = FakeAsyncAgentsClient() + client = FakeClient(agents) + await agent.create_async_session( + client=client, + channel="channel", + token="test-token", + agent_uid="1", + remote_uids=["100"], + **session_kwargs, + ).start() + return agents.calls[0] + + +def full_agent_with_tts(tts): + return ( + 
Agent(name="support") + .with_stt(DeepgramSTT(api_key="dg-key", model="nova-2", language="en")) + .with_llm( + OpenAI( + api_key="openai-key", + base_url="https://api.openai.com/v1/chat/completions", + model="gpt-4o", + ) + ) + .with_tts(tts) + ) + + +def invalid_google_tts_properties(): + return { + "channel": "channel", + "token": "test-token", + "agent_rtc_uid": "1", + "remote_rtc_uids": ["100"], + "tts": { + "vendor": "google", + "params": { + "credentials": "{}", + }, + }, + } + + # --------------------------------------------------------------------------- # Pattern 2: _build_start_properties — properties-only shape # --------------------------------------------------------------------------- @@ -157,6 +217,48 @@ def build_properties(agent, allow_missing=None): ) +def test_request_properties_validation_raises_without_preset_or_pipeline() -> None: + with pytest.raises(Exception): + AgentSession._request_properties_for_start( # noqa: SLF001 + invalid_google_tts_properties(), + resolved_preset=None, + pipeline_id=None, + ) + + +def test_request_properties_validation_fallback_allows_preset_partial_config() -> None: + properties = invalid_google_tts_properties() + + request_properties = AgentSession._request_properties_for_start( # noqa: SLF001 + properties, + resolved_preset="openai_tts_1", + pipeline_id=None, + ) + + assert request_properties is properties + + +def test_request_properties_validation_fallback_is_limited_to_preset_category() -> None: + with pytest.raises(Exception): + AgentSession._request_properties_for_start( # noqa: SLF001 + invalid_google_tts_properties(), + resolved_preset="openai_gpt_4o_mini", + pipeline_id=None, + ) + + +def test_request_properties_validation_fallback_allows_pipeline_partial_config() -> None: + properties = invalid_google_tts_properties() + + request_properties = AgentSession._request_properties_for_start( # noqa: SLF001 + properties, + resolved_preset=None, + pipeline_id="pipeline-id", + ) + + assert request_properties is 
properties + + # =========================================================================== # Scenario 1 — BYOK pipeline (full properties shape) # =========================================================================== @@ -268,7 +370,7 @@ def test_llm_config_greeting_wins_over_agent_level_greeting() -> None: def test_vertex_ai_llm_constructs_correct_url_and_params() -> None: - """VertexAILLM auto-constructs the aiplatform URL and injects project_id/location into params.""" + """VertexAILLM auto-constructs the aiplatform URL; project_id/location are URL-encoded, not in params.""" agent = Agent(name="support").with_llm( VertexAILLM( api_key="vertex-token", @@ -285,9 +387,9 @@ def test_vertex_ai_llm_constructs_correct_url_and_params() -> None: assert expected_url_fragment in llm["url"] assert "my-project" in llm["url"] assert llm["style"] == "gemini" - assert llm["params"]["project_id"] == "my-project" - assert llm["params"]["location"] == "us-central1" assert llm["params"]["model"] == "gemini-2.0-flash" + assert "project_id" not in llm["params"] + assert "location" not in llm["params"] # =========================================================================== @@ -835,8 +937,8 @@ def test_byok_google_tts_params() -> None: assert config["vendor"] == "google" p = config["params"] assert p["credentials"] == "{}" - assert p["voice_selection_params"]["name"] == "en-US-JennyNeural" - assert p["voice_selection_params"]["language_code"] == "en-US" + assert p["VoiceSelectionParams"]["name"] == "en-US-JennyNeural" + assert p["VoiceSelectionParams"]["language_code"] == "en-US" def test_byok_amazon_tts_params() -> None: @@ -876,7 +978,7 @@ def test_byok_rime_tts_params() -> None: assert config["vendor"] == "rime" assert config["params"]["api_key"] == "rime-key" assert config["params"]["speaker"] == "speaker" - assert config["params"]["model_id"] == "mist" + assert config["params"]["modelId"] == "mist" def test_byok_fishaudio_tts_params() -> None: @@ -922,6 +1024,88 @@ def 
test_byok_murf_tts_params() -> None: assert props["tts"]["params"]["voiceId"] == "Ariana" +def test_start_session_google_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts( + GoogleTTS( + key="{}", + voice_name="en-US-JennyNeural", + language_code="en-US", + sample_rate_hertz=24000, + ) + ) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["VoiceSelectionParams"]["name"] == "en-US-JennyNeural" + assert params["VoiceSelectionParams"]["language_code"] == "en-US" + assert params["AudioConfig"]["sample_rate_hertz"] == 24000 + assert "voice_selection_params" not in params + assert "audio_config" not in params + + +def test_start_session_rime_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts(RimeTTS(key="rime-key", speaker="speaker", model_id="mist")) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["modelId"] == "mist" + assert "model_id" not in params + + +def test_start_session_murf_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts(MurfTTS(key="murf-key", voice_id="Ariana")) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["voiceId"] == "Ariana" + assert "voice_id" not in params + + +@pytest.mark.asyncio +async def test_async_start_session_google_tts_preserves_wire_aliases() -> None: + agent = full_agent_with_tts( + GoogleTTS( + key="{}", + voice_name="en-US-JennyNeural", + language_code="en-US", + sample_rate_hertz=24000, + ) + ) + + call = await start_async_session(agent) + properties = dump_wire(call["properties"]) + params = properties["tts"]["params"] + + assert params["VoiceSelectionParams"]["name"] == "en-US-JennyNeural" + assert params["VoiceSelectionParams"]["language_code"] == "en-US" + assert params["AudioConfig"]["sample_rate_hertz"] == 24000 + assert 
"voice_selection_params" not in params + assert "audio_config" not in params + + +def test_start_session_managed_minimax_tts_keeps_partial_preset_config() -> None: + agent = ( + Agent(name="support") + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_8_turbo", voice_id="English_captivating_female1")) + ) + + call = start_session(agent) + properties = dump_wire(call["properties"]) + + assert "minimax_speech_2_8_turbo" in (call["preset"] or "") + assert properties["tts"]["vendor"] == "minimax" + assert properties["tts"]["params"] == { + "voice_setting": {"voice_id": "English_captivating_female1"}, + } + + # --------------------------------------------------------------------------- # BYOK Vendor Coverage Matrix — MLLM vendors # --------------------------------------------------------------------------- diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index 775a3ac..5a509ff 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -7,8 +7,10 @@ DeepgramSTT, ElevenLabsTTS, GoogleSTT, + MicrosoftSTT, OpenAI, OpenAISTT, + SarvamSTT, SpeechmaticsSTT, TurnDetectionConfig, ) @@ -98,6 +100,14 @@ def test_stt_vendor_params_match_documented_shapes() -> None: "language": "en", } + # api_key → wire key "key"; keyterm passes through unchanged + assert DeepgramSTT(api_key="dg-key", model="nova-3", language="en", keyterm="term").to_config()["params"] == { + "key": "dg-key", + "model": "nova-3", + "language": "en", + "keyterm": "term", + } + assert OpenAISTT( api_key="openai-key", model="gpt-4o-mini-transcribe", @@ -147,6 +157,22 @@ def test_stt_vendor_params_match_documented_shapes() -> None: "uri": "wss://example.test/ws", } + assert MicrosoftSTT(key="ms-key", region="eastus", language="en-US").to_config()["params"] == { + "key": "ms-key", + "region": "eastus", + "language": "en-US", + } + + assert SpeechmaticsSTT(api_key="sm-key", language="en").to_config()["params"] == { + "api_key": 
"sm-key", + "language": "en", + } + + assert SarvamSTT(api_key="sarvam-key", language="en-IN").to_config()["params"] == { + "api_key": "sarvam-key", + "language": "en-IN", + } + def test_assemblyai_params_stay_nested_and_asr_language_comes_from_turn_detection() -> None: props = properties( diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py index b902bda..11e3f35 100644 --- a/tests/custom/test_tts_vendors.py +++ b/tests/custom/test_tts_vendors.py @@ -1,10 +1,18 @@ import pytest -from agora_agent import AmazonTTS, CartesiaTTS, DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS +from agora_agent import AmazonTTS, CartesiaTTS, DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MicrosoftTTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS from agora_agent.agents.types.start_agents_request_properties import StartAgentsRequestProperties +from agora_agent.core.jsonable_encoder import jsonable_encoder +from agora_agent.core.pydantic_utilities import parse_obj_as def test_tts_vendor_params_match_generated_core_shapes() -> None: + assert MicrosoftTTS(key="ms-key", region="eastus", voice_name="en-US-JennyNeural").to_config()["params"] == { + "key": "ms-key", + "region": "eastus", + "voice_name": "en-US-JennyNeural", + } + assert AmazonTTS(access_key="access", secret_key="secret", region="us-east-1", voice_id="Joanna", engine="neural").to_config()["params"] == { "aws_access_key_id": "access", "aws_secret_access_key": "secret", @@ -15,8 +23,8 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: assert GoogleTTS(key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000).to_config()["params"] == { "credentials": "{}", - "voice_selection_params": {"name": "en-US-JennyNeural", "language_code": "en-US"}, - "audio_config": {"sample_rate_hertz": 24000}, + "VoiceSelectionParams": {"name": "en-US-JennyNeural", "language_code": 
"en-US"}, + "AudioConfig": {"sample_rate_hertz": 24000}, } assert CartesiaTTS(api_key="cartesia-key", voice_id="voice", model_id="sonic-2", sample_rate=24000).to_config()["params"] == { @@ -29,7 +37,7 @@ def test_tts_vendor_params_match_generated_core_shapes() -> None: assert RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config()["params"] == { "api_key": "rime-key", "speaker": "speaker", - "model_id": "mist", + "modelId": "mist", } assert FishAudioTTS(key="fish-key", reference_id="ref", backend="speech-1.5").to_config()["params"] == { @@ -120,40 +128,30 @@ def test_tts_managed_mode_validation_matches_core_shapes() -> None: def test_tts_wire_serialization_applies_fern_aliases() -> None: - """Verify alias-sensitive TTS params reach the wire with the correct Fern aliases. - - The intermediate to_config() / build_properties() helpers return snake_case - field names. The real POST body goes through StartAgentsRequestProperties → - .dict(by_alias=True) → convert_and_respect_annotation_metadata(direction='write'), - which is what jsonable_encoder calls in the live HTTP client. These tests - exercise that full chain so a Fern alias regression would be caught. 
- """ + """Verify alias-sensitive TTS params keep the exact provider wire keys.""" _BASE = dict(channel="ch", token="tok", agent_rtc_uid="1", remote_rtc_uids=["100"]) - # Google TTS: voice_selection_params and audio_config must arrive as PascalCase aliases google_config = GoogleTTS( key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000 ).to_config() - assert "voice_selection_params" in google_config["params"] # pre-condition: to_config emits snake_case - google_wire = StartAgentsRequestProperties(**_BASE, tts=google_config).dict(by_alias=True) + assert "VoiceSelectionParams" in google_config["params"] + google_wire = jsonable_encoder(parse_obj_as(StartAgentsRequestProperties, {**_BASE, "tts": google_config})) google_params = google_wire["tts"]["params"] assert "VoiceSelectionParams" in google_params, f"wire missing VoiceSelectionParams, got: {list(google_params)}" assert "voice_selection_params" not in google_params assert "AudioConfig" in google_params assert "audio_config" not in google_params - # Rime TTS: model_id must arrive as modelId alias rime_config = RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config() - assert "model_id" in rime_config["params"] # pre-condition: to_config emits snake_case - rime_wire = StartAgentsRequestProperties(**_BASE, tts=rime_config).dict(by_alias=True) + assert "modelId" in rime_config["params"] + rime_wire = jsonable_encoder(parse_obj_as(StartAgentsRequestProperties, {**_BASE, "tts": rime_config})) rime_params = rime_wire["tts"]["params"] assert "modelId" in rime_params, f"wire missing modelId, got: {list(rime_params)}" assert "model_id" not in rime_params - # Murf TTS: voiceId (emitted by to_config as alias) must survive through wire serialization murf_config = MurfTTS(key="murf-key", voice_id="Ariana").to_config() - assert "voiceId" in murf_config["params"] # to_config currently emits alias directly - murf_wire = StartAgentsRequestProperties(**_BASE, 
tts=murf_config).dict(by_alias=True) + assert "voiceId" in murf_config["params"] + murf_wire = jsonable_encoder(parse_obj_as(StartAgentsRequestProperties, {**_BASE, "tts": murf_config})) murf_params = murf_wire["tts"]["params"] assert "voiceId" in murf_params, f"wire missing voiceId, got: {list(murf_params)}" assert murf_params["voiceId"] == "Ariana" From c287be1cf51567d214970e044e6c1de74c60da2f Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 13:27:23 -0400 Subject: [PATCH 11/14] Prepare Python SDK v2.2.0 release --- changelog.md | 20 ++++++++++++++++++++ compat/agora-agent-server-sdk/pyproject.toml | 4 ++-- pyproject.toml | 2 +- src/agora_agent/core/client_wrapper.py | 4 ++-- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/changelog.md b/changelog.md index 9050db2..b04e213 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [v2.2.0] — 2026-06-05 + +### Added + +- **Expanded provider surface** — Added generated API support for the latest Conversational AI vendors and configuration types, including Dify LLM and Generic Avatar. +- **STT interaction language fields** — Added `interaction_language` support across Speechmatics, Deepgram, Microsoft, Google, Amazon, AssemblyAI, and Sarvam STT wrappers. +- **Deepgram keyterm** — Added `keyterm` support on `DeepgramSTT`, serialized as `asr.params.keyterm`. + +### Changed + +- **MiniMax managed presets** — MiniMax preset-backed TTS now keeps the preset model as an internal hint while sending only supported partial TTS settings such as `voice_setting.voice_id`. +- **Vertex AI LLM routing** — `VertexAILLM` now keeps project and location in the generated endpoint URL instead of duplicating them in `llm.params`. 
+ +### Fixed + +- **Provider wire keys** — Corrected alias-sensitive TTS payloads so Google TTS emits `VoiceSelectionParams` and `AudioConfig`, Rime TTS emits `modelId`, and Murf TTS preserves `voiceId`. +- **AgentKit request validation** — Start request validation now de-aliases REST-shaped provider dictionaries before constructing generated request models, while still allowing preset and pipeline-backed partial configs. +- **Request body coverage** — Added regression tests for BYOK, preset-backed, mixed preset/BYOK, and pipeline override request shapes across provider configurations. +- **Python docs examples** — Added a docs guard to keep Python examples on snake_case kwargs while allowing documented JSON wire keys. + ## [v2.1.0] — 2026-06-02 ### Added diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml index eea45d7..078ac75 100644 --- a/compat/agora-agent-server-sdk/pyproject.toml +++ b/compat/agora-agent-server-sdk/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" [tool.poetry] name = "agora-agent-server-sdk" -version = "v2.1.1" +version = "v2.2.0" description = "Compatibility shim for the renamed agora-agents package." 
readme = "README.md" authors = [] @@ -35,7 +35,7 @@ Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-pyth [tool.poetry.dependencies] python = "^3.8" -agora-agents = ">=2.1.1,<3.0.0" +agora-agents = ">=2.2.0,<3.0.0" [build-system] requires = ["poetry-core"] diff --git a/pyproject.toml b/pyproject.toml index f1e9e04..327306a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agents" [tool.poetry] name = "agora-agents" -version = "v2.1.1" +version = "v2.2.0" description = "" readme = "README.md" authors = [] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index 2df9814..ba5e462 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.1.2", + "User-Agent": "agora-agents/v2.2.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.1.2", + "X-Fern-SDK-Version": "v2.2.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header From 7f4f9da864bd70bfdcf4d5ca0d2babd28076bf1d Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 13:59:11 -0400 Subject: [PATCH 12/14] Fix Python release validation and provider docs --- changelog.md | 2 +- docs/concepts/vendors.md | 2 +- docs/reference/vendors.md | 2 +- src/agora_agent/agentkit/agent_session.py | 5 ++++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/changelog.md b/changelog.md index b04e213..303d9fa 100644 --- a/changelog.md +++ b/changelog.md @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). 
### Added - **Expanded provider surface** — Added generated API support for the latest Conversational AI vendors and configuration types, including Dify LLM and Generic Avatar. -- **STT interaction language fields** — Added `interaction_language` support across Speechmatics, Deepgram, Microsoft, Google, Amazon, AssemblyAI, and Sarvam STT wrappers. +- **Interaction language handling** — AgentKit now consistently derives REST `asr.language` from `turn_detection.language` while keeping provider-specific STT language values under `asr.params`. - **Deepgram keyterm** — Added `keyterm` support on `DeepgramSTT`, serialized as `asr.params.keyterm`. ### Changed diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 2ec5439..217b77d 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -80,7 +80,7 @@ Use `turn_detection.language` for Agora interaction language; it defaults to `en | Class | Provider | Required Parameters | |---|---|---| | `SpeechmaticsSTT` | Speechmatics | `api_key`, `language` | -| `DeepgramSTT` | Deepgram | `model` for Agora-managed `nova-2`/`nova-3`; `api_key` for BYOK | +| `DeepgramSTT` | Deepgram | `model` for Agora-managed `nova-2`/`nova-3`; `api_key` for BYOK; `language?`, `keyterm?` | | `MicrosoftSTT` | Microsoft Azure | `key`, `region`, `language` | | `OpenAISTT` | OpenAI | `api_key` | | `GoogleSTT` | Google Cloud | `project_id`, `location`, `adc_credentials_string`, `language` | diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index c7d8ada..1ab8aeb 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -336,7 +336,7 @@ Use `turn_detection.language` for Agora interaction language; it defaults to `en | `api_key` | `str` | BYOK only | `None` | Deepgram API key. Optional only for Agora-managed `nova-2` and `nova-3`. 
| | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | -| `interaction_language` | `str` | No | `None` | Agora `asr.language` override | +| `keyterm` | `str` | No | `None` | Boost specialized terms and brands; serialized as `asr.params.keyterm` | | `smart_format` | `bool` | No | `None` | Enable smart formatting | | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index 8eb9810..2900c18 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -362,9 +362,12 @@ def _request_properties_for_start( if pipeline_id: return resolved_properties if resolved_preset: + normalized_preset = normalize_preset_input(resolved_preset) + if not normalized_preset: + raise preset_categories = { category - for item in normalize_preset_input(resolved_preset).split(",") + for item in normalized_preset.split(",") for category in [get_preset_category(item)] if category is not None } From 42d452acc691f843585add50f3257dd985d444c5 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 14:10:30 -0400 Subject: [PATCH 13/14] Remove unused Python STT interaction language fields --- src/agora_agent/agentkit/vendors/stt.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index 2a0c4c0..d390573 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -4,9 +4,6 @@ from .base import BaseSTT -# BCP-47 language tag for asr.language (the Agora interaction language). 
-InteractionLanguage = str - _DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} @@ -15,7 +12,6 @@ class SpeechmaticsSTTOptions(BaseModel): api_key: str = Field(..., description="Speechmatics API key") language: str = Field(..., description="Language code (e.g., en, es, fr)") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -49,7 +45,6 @@ class DeepgramSTTOptions(BaseModel): model: Optional[str] = Field(default=None, description="Model (e.g., nova-2, enhanced, base)") language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") keyterm: Optional[str] = Field(default=None, description="Boost specialized terms and brands for Deepgram") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") smart_format: Optional[bool] = Field(default=None, description="Enable smart formatting") punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) @@ -267,7 +262,6 @@ class SarvamSTTOptions(BaseModel): api_key: str = Field(..., description="Sarvam API key") language: str = Field(..., description="Language code (e.g., en, hi, ta)") - interaction_language: Optional[InteractionLanguage] = Field(default=None, description="Agora interaction language for asr.language") model: Optional[str] = Field(default=None, description="Model name") additional_params: Optional[Dict[str, Any]] = Field(default=None) From 474f1b2640d85515f1faa7eb585ada72ca0ad90d Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Fri, 5 Jun 2026 14:18:37 -0400 Subject: [PATCH 14/14] Clarify turn detection language 
validation errors --- src/agora_agent/agentkit/agent.py | 2 +- tests/custom/test_stt_language.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 992945c..1daba82 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -306,7 +306,7 @@ def _is_turn_detection_language(value: typing.Any) -> bool: def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: if not _is_turn_detection_language(value): - raise ValueError(f"Invalid interaction language: {value}") + raise ValueError(f"Invalid turn_detection.language: {value}") return value # type: ignore[return-value] diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py index 5a509ff..0ea3c7d 100644 --- a/tests/custom/test_stt_language.py +++ b/tests/custom/test_stt_language.py @@ -73,7 +73,7 @@ def test_turn_detection_language_can_differ_from_provider_language() -> None: def test_invalid_turn_detection_language_is_rejected() -> None: - with pytest.raises(ValueError, match="Invalid interaction language: xx"): + with pytest.raises(ValueError, match="Invalid turn_detection.language: xx"): properties(Agent(turn_detection=TurnDetectionConfig(language="xx"))) # type: ignore[arg-type]