From b8b75a942fa616afad63d1c532ba3c5862aabc5d Mon Sep 17 00:00:00 2001
From: dan-ince-aai <dince@assemblyai.com>
Date: Wed, 18 Feb 2026 17:42:37 +0000
Subject: [PATCH 1/7] s2s docs

---
 .../voice-agents/speechtospeech.mdx           | 781 ++++++++++++++++++
 1 file changed, 781 insertions(+)
 create mode 100644 fern/pages/02-speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx

diff --git a/fern/pages/02-speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/02-speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
new file mode 100644
index 000000000..595d9fd7b
--- /dev/null
+++ b/fern/pages/02-speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -0,0 +1,781 @@
+---
+title: "Speech-to-Speech"
+description: "Build real-time voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back."
+---
+
+import { AgentGenerator } from "../../../../assets/components/AgentGenerator";
+
+Build voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back — with built-in transcription, turn detection, and function calling. The API is compatible with the [OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime), so you can use the OpenAI SDK or any OpenAI-compatible framework like LiveKit.
+
+## Quickstart
+
+Install dependencies and talk to your agent in under a minute.
+
+```bash
+pip install websockets sounddevice
+```
+
+```python
+import asyncio, base64, json, threading
+import sounddevice as sd
+import websockets
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+
+class AudioPlayer:
+    """Buffers and plays PCM16 audio in real time."""
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+    }}))
+
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"Agent: {e.get('transcript', '')}")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"You:   {e.get('transcript', '')}")
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+Replace `YOUR_ASSEMBLYAI_API_KEY` with your key from the [AssemblyAI dashboard](https://www.assemblyai.com/dashboard/signup), run the script, and start talking.
+
+---
+
+## How it works
+
+```
+Client                                     Server
+  |                                           |
+  |--- WebSocket connect -------------------->|
+  |--- session.update (config) -------------->|
+  |--- input_audio_buffer.append ------------>|  stream mic audio
+  |                                           |
+  |<------------ session.created -------------|
+  |<------------ speech_started --------------|  user is talking
+  |<------------ speech_stopped --------------|  user finished
+  |<------------ transcription.completed -----|  what the user said
+  |<------------ response.output_audio.delta -|  agent speaks back
+  |<------------ response.done ---------------|
+  |                                           |
+```
+
+1. **Connect** — Open a WebSocket to `wss://speech-to-speech.assemblyai.com/v1/realtime` with your API key in the `Authorization: Bearer` header.
+2. **Configure** — Send a `session.update` with your voice, instructions, turn detection settings, and any tools.
+3. **Stream audio** — Send base64-encoded PCM16 audio chunks. The server detects when the user starts and stops speaking.
+4. **Receive responses** — The server transcribes the user's speech, generates a response, and streams back audio and text in real time.
+
+The API is fully compatible with the OpenAI Realtime protocol, so the [OpenAI Python SDK](https://github.com/openai/openai-python), [LiveKit Agents](https://docs.livekit.io/agents/), and any OpenAI-compatible client work out of the box — just point them at `wss://speech-to-speech.assemblyai.com/v1`.
+
+---
+
+## Agent generator
+
+Describe your agent and we'll generate the complete code — system prompt, tool definitions, and a runnable script.
+
+<AgentGenerator />
+
+---
+
+## Configuration
+
+Configure your session by sending a `session.update` event after connecting. The API accepts two session formats depending on your integration approach.
+
+### Flat format (Raw WebSocket)
+
+```json
+{
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "voice": "sage",
+    "input_audio_format": "pcm16",
+    "input_audio_sample_rate": 24000,
+    "output_audio_format": "pcm16",
+    "output_audio_sample_rate": 24000,
+    "input_audio_transcription": {"model": "universal-streaming"},
+    "output_modalities": ["audio", "text"],
+    "turn_detection": {
+      "type": "server_vad",
+      "threshold": 0.5,
+      "prefix_padding_ms": 300,
+      "silence_duration_ms": 200,
+      "create_response": true
+    },
+    "tools": [],
+    "tool_choice": "auto"
+  }
+}
+```
+
+### Nested format (OpenAI SDK / LiveKit)
+
+The OpenAI GA SDK and LiveKit plugin use a nested session format.
+
+```json
+{
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "output_modalities": ["audio", "text"],
+    "audio": {
+      "input": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "transcription": {"model": "universal-streaming"},
+        "turn_detection": {
+          "type": "server_vad",
+          "threshold": 0.5,
+          "prefix_padding_ms": 300,
+          "silence_duration_ms": 200,
+          "create_response": true
+        }
+      },
+      "output": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "voice": "sage"
+      }
+    },
+    "tools": [],
+    "tool_choice": "auto"
+  }
+}
+```
+
+### Session parameters
+
+<ParamField path="instructions" type="string">
+  System prompt for the AI agent. Defines personality, behavior, and constraints.
+</ParamField>
+
+<ParamField path="voice" type="string" default="sage">
+  Voice for agent audio responses. One of: `sage`, `ember`, `breeze`, `cascade`.
+</ParamField>
+
+<ParamField path="input_audio_format" type="string" default="pcm16">
+  Input audio encoding. Use `pcm16` (signed 16-bit little-endian).
+</ParamField>
+
+<ParamField path="input_audio_sample_rate" type="integer" default="24000">
+  Input audio sample rate in Hz.
+</ParamField>
+
+<ParamField path="output_audio_format" type="string" default="pcm16">
+  Output audio encoding. Use `pcm16` (signed 16-bit little-endian).
+</ParamField>
+
+<ParamField path="output_audio_sample_rate" type="integer" default="24000">
+  Output audio sample rate in Hz.
+</ParamField>
+
+<ParamField path="output_modalities" type="array">
+  What the agent returns. Include `"audio"` for spoken responses and `"text"` for transcripts.
+</ParamField>
+
+<ParamField path="input_audio_transcription" type="object">
+  Enables real-time transcription of user speech. Set `model` to `"universal-streaming"`.
+</ParamField>
+
+<ParamField path="turn_detection" type="object">
+  Server-side voice activity detection. See [Turn detection](#turn-detection).
+</ParamField>
+
+<ParamField path="tools" type="array" default="[]">
+  Functions the agent can call. See [Tool calling](#tool-calling).
+</ParamField>
+
+<ParamField path="tool_choice" type="string" default="auto">
+  When to use tools. `"auto"` lets the model decide.
+</ParamField>
+
+### Audio format
+
+All audio is **PCM16** (signed 16-bit integer, little-endian), **mono**, **24,000 Hz**. Audio is base64-encoded inside JSON messages. Each chunk should be approximately 20 ms (480 samples, 960 bytes).
+
+### Voices
+
+| Voice | ID |
+|-------|----|
+| Sage | `sage` |
+| Ember | `ember` |
+| Breeze | `breeze` |
+| Cascade | `cascade` |
+
+### Turn detection
+
+The server automatically detects when the user starts and stops speaking using voice activity detection (VAD). When the user finishes a turn and `create_response` is `true` (the default), the agent responds automatically.
+
+```json
+"turn_detection": {
+  "type": "server_vad",
+  "threshold": 0.5,
+  "prefix_padding_ms": 300,
+  "silence_duration_ms": 200,
+  "create_response": true
+}
+```
+
+<ParamField path="type" type="string" required>
+  Set to `"server_vad"` for server-side voice activity detection.
+</ParamField>
+
+<ParamField path="threshold" type="float" default="0.5">
+  Speech detection sensitivity (0.0 to 1.0). Lower values detect quieter speech.
+</ParamField>
+
+<ParamField path="prefix_padding_ms" type="integer" default="300">
+  Audio to preserve before speech onset, in milliseconds. Prevents clipping the start of a sentence.
+</ParamField>
+
+<ParamField path="silence_duration_ms" type="integer" default="200">
+  How long the user must pause before the server considers them done speaking, in milliseconds.
+</ParamField>
+
+<ParamField path="create_response" type="boolean" default="true">
+  Automatically generate an agent response when the user finishes speaking.
+</ParamField>
+
+---
+
+## Tool calling
+
+Give your agent the ability to call functions in your application — look up data, take actions, or call external APIs — then continue the conversation with the result.
+
+### Define tools in your session config
+
+```json
+"tools": [{
+  "type": "function",
+  "name": "get_weather",
+  "description": "Get the current weather for a location",
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "location": {"type": "string", "description": "City name"}
+    },
+    "required": ["location"]
+  }
+}],
+"tool_choice": "auto"
+```
+
+### Handle tool calls
+
+When the agent decides to call a function, the server sends `response.function_call_arguments.done` while the response is still in progress. Start executing the function immediately — you don't need to wait for the response to finish. When `response.done` arrives, send the result back as a `function_call_output` item using `conversation.item.create`.
+
+```python
+pending_tasks = {}
+
+async for raw in ws:
+    e = json.loads(raw)
+    et = e.get("type", "")
+
+    if et == "response.function_call_arguments.done":
+        # Start executing immediately — don't wait for response.done
+        args = json.loads(e["arguments"])
+        pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+
+    elif et == "response.done" and pending_tasks:
+        # Response is complete — send back the results
+        for call_id, task in pending_tasks.items():
+            result = await task
+            await ws.send(json.dumps({
+                "type": "conversation.item.create",
+                "item": {
+                    "type": "function_call_output",
+                    "call_id": call_id,
+                    "output": json.dumps(result),
+                },
+            }))
+        pending_tasks.clear()
+
+    elif et == "response.output_audio.delta":
+        player.play(base64.b64decode(e["delta"]))
+```
+
+The pattern is: **receive the call** → **start executing immediately** → **send the result when `response.done` arrives**. Your function runs concurrently while the response completes, so there's no wasted time.
+
+---
+
+## Events reference
+
+### Client → Server
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.update` | Configure the session | `session`: configuration object |
+| `input_audio_buffer.append` | Stream an audio chunk | `audio`: base64-encoded PCM16 |
+| `input_audio_buffer.commit` | Commit buffered audio as a user turn | — |
+| `input_audio_buffer.clear` | Discard buffered audio | — |
+| `conversation.item.create` | Add a message or tool result | `item`: conversation item |
+| `conversation.item.delete` | Remove a conversation item | `item_id`: ID to remove |
+| `response.create` | Trigger the agent to respond | — |
+
+### Server → Client
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.created` | Session initialized | `session.id` |
+| `input_audio_buffer.speech_started` | User started speaking | `audio_start_ms` |
+| `input_audio_buffer.speech_stopped` | User stopped speaking | `audio_end_ms` |
+| `input_audio_buffer.committed` | Audio committed as a turn | — |
+| `conversation.item.created` | New conversation item added | `item` |
+| `conversation.item.input_audio_transcription.completed` | User speech transcribed | `transcript` |
+| `response.created` | Agent started generating a response | — |
+| `response.output_audio.delta` | Agent audio chunk | `delta`: base64 PCM16 |
+| `response.output_audio.done` | Agent audio complete | — |
+| `response.output_audio_transcript.delta` | Agent text (streaming) | `delta` |
+| `response.output_audio_transcript.done` | Agent text (final) | `transcript` |
+| `response.function_call_arguments.done` | Agent requesting a tool call | `call_id`, `name`, `arguments` |
+| `response.done` | Response complete | `response.status`: `completed` or `cancelled` |
+| `error` | Error occurred | `error.message` |
+
+---
+
+## Complete examples
+
+Production-ready examples for three integration approaches. Each handles microphone input, speaker output, turn detection, transcription, and tool calling.
+
+### Raw WebSocket
+
+Direct WebSocket control using the `websockets` library.
+
+```bash
+pip install websockets sounddevice
+```
+
+```python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+import websockets
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
+
+
+async def run_tool(name, args):
+    """Replace with your own tool implementations."""
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    try:
+        ws = await websockets.connect(WS_URL, extra_headers={"Authorization": f"Bearer {API_KEY}"})
+    except TypeError:
+        ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+        "tools": TOOLS,
+        "tool_choice": "auto",
+    }}))
+
+    pending_tasks = {}
+
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You started speaking")
+            elif et == "input_audio_buffer.speech_stopped":
+                print(f"[{t}] You stopped speaking")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"[{t}] You:   {e.get('transcript', '')}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                s = e.get("response", {}).get("status", "?")
+                print(f"[{t}] Done ({s})")
+                if pending_tasks and s == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await ws.send(json.dumps({
+                            "type": "conversation.item.create",
+                            "item": {"type": "function_call_output", "call_id": cid, "output": json.dumps(result)},
+                        }))
+                    pending_tasks.clear()
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### OpenAI Python SDK
+
+Uses the OpenAI GA Realtime API. Note the differences from the beta API: `websocket_base_url` instead of `base_url`, `client.realtime.connect()` instead of `client.beta.realtime.connect()`, and the nested session format.
+
+```bash
+pip install openai sounddevice
+```
+
+```python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+from openai import AsyncOpenAI
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+SAMPLE_RATE = 24000
+
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
+
+
+async def run_tool(name, args):
+    """Replace with your own tool implementations."""
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    connection = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+
+    print("Listening — start talking.\n")
+
+    async def send_config():
+        await connection.session.update(session={
+            "instructions": "You are a helpful voice assistant. Keep responses brief.",
+            "output_modalities": ["audio", "text"],
+            "audio": {
+                "input": {
+                    "format": {"type": "audio/pcm", "rate": 24000},
+                    "transcription": {"model": "universal-streaming"},
+                    "turn_detection": {
+                        "type": "server_vad", "threshold": 0.5,
+                        "prefix_padding_ms": 300, "silence_duration_ms": 200,
+                    },
+                },
+                "output": {
+                    "format": {"type": "audio/pcm", "rate": 24000},
+                    "voice": "sage",
+                },
+            },
+            "tools": TOOLS,
+            "tool_choice": "auto",
+        })
+
+    async def stream_mic():
+        while True:
+            pcm = await q.get()
+            await connection.input_audio_buffer.append(audio=base64.b64encode(pcm).decode())
+
+    async def handle_events():
+        pending_tasks = {}
+        while True:
+            data = await connection.recv_bytes()
+            e = json.loads(data.decode("utf-8"))
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You started speaking")
+            elif et == "input_audio_buffer.speech_stopped":
+                print(f"[{t}] You stopped speaking")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                txt = e.get("transcript", "")
+                if txt:
+                    print(f"[{t}] You:   {txt}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                s = e.get("response", {}).get("status", "?")
+                print(f"[{t}] Done ({s})")
+                if pending_tasks and s == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await connection.conversation.item.create(item={
+                            "type": "function_call_output", "call_id": cid,
+                            "output": json.dumps(result)})
+                    pending_tasks.clear()
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
+
+    try:
+        await asyncio.gather(send_config(), stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close()
+        await connection.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### LiveKit Agents
+
+Uses the [LiveKit Agents framework](https://docs.livekit.io/agents/) with the OpenAI Realtime plugin. LiveKit handles audio transport, room management, and client connections — you define the agent behavior.
+
+```bash
+pip install "livekit-agents[openai,silero]" python-dotenv
+```
+
+```python
+import asyncio, os
+from dotenv import load_dotenv
+from livekit.agents import Agent, AgentServer, AgentSession, JobContext, JobProcess, RunContext, cli, function_tool
+from livekit.plugins import openai, silero
+from openai.types.beta.realtime.session import TurnDetection
+from openai.types.realtime import AudioTranscription
+
+load_dotenv()
+
+
+class VoiceAgent(Agent):
+    def __init__(self):
+        super().__init__(instructions="You are a helpful voice assistant. Keep responses brief.")
+
+    @function_tool
+    async def get_weather(self, context: RunContext, location: str):
+        """Get the current weather for a location.
+
+        Args:
+            location: City name
+        """
+        return f"72 degrees and sunny in {location}."
+
+
+server = AgentServer()
+
+
+def prewarm(proc: JobProcess):
+    proc.userdata["vad"] = silero.VAD.load()
+
+
+server.setup_fnc = prewarm
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext):
+    session = AgentSession(
+        llm=openai.realtime.RealtimeModel(
+            base_url="wss://speech-to-speech.assemblyai.com/v1",
+            api_key=os.environ["ASSEMBLYAI_API_KEY"],
+            model="universal-streaming",
+            voice="sage",
+            input_audio_transcription=AudioTranscription(model="universal-streaming"),
+            turn_detection=TurnDetection(
+                type="server_vad",
+                threshold=0.5,
+                prefix_padding_ms=300,
+                silence_duration_ms=200,
+                create_response=True,
+            ),
+        )
+    )
+    await session.start(agent=VoiceAgent(), room=ctx.room)
+    await ctx.connect()
+
+
+if __name__ == "__main__":
+    cli.run_app(server)
+```
+
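+Save the script as `agent.py`, set `ASSEMBLYAI_API_KEY` in your environment or a `.env` file, and run it in console mode: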
+
+```bash
+python agent.py console
+```

From c1b0b17aafb40fd45c3098f94472ef44105b9ab9 Mon Sep 17 00:00:00 2001
From: dan-ince-aai <dince@assemblyai.com>
Date: Wed, 18 Feb 2026 19:10:42 +0000
Subject: [PATCH 2/7] Add interactive agent generator component to S2S docs

Adds an AgentGenerator React component that lets users describe their
voice agent in natural language, pick an output format (Python/JS/config),
and opens Claude/ChatGPT/Gemini with the full S2S API reference pre-loaded
to generate a complete agent implementation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 fern/assets/components/AgentGenerator.tsx | 453 ++++++++++++++++++++++
 1 file changed, 453 insertions(+)
 create mode 100644 fern/assets/components/AgentGenerator.tsx

diff --git a/fern/assets/components/AgentGenerator.tsx b/fern/assets/components/AgentGenerator.tsx
new file mode 100644
index 000000000..1c09db552
--- /dev/null
+++ b/fern/assets/components/AgentGenerator.tsx
@@ -0,0 +1,453 @@
+"use client";
+import * as React from "react";
+
+const DOCS_URL =
+  "https://www.assemblyai.com/docs/speech-to-text/voice-agents/speechtospeech";
+
+type OutputFormat = "python" | "javascript" | "config";
+
+const LLM_CONTEXT = `You are an expert at building real-time voice agents using the AssemblyAI Speech-to-Speech API. Based on the user's description, generate a complete voice agent implementation.
+
+## AssemblyAI Speech-to-Speech API Reference
+
+Endpoint: wss://speech-to-speech.assemblyai.com/v1/realtime
+Auth: Authorization: Bearer YOUR_ASSEMBLYAI_API_KEY header on WebSocket connect
+Audio: PCM16 (signed 16-bit little-endian), mono, 24000 Hz, base64-encoded in JSON
+Voices: sage (default), ember, breeze, cascade
+
+### Session config (flat format, for raw WebSocket):
+{
+  "type": "session.update",
+  "session": {
+    "instructions": "System prompt here",
+    "voice": "sage",
+    "input_audio_format": "pcm16",
+    "input_audio_sample_rate": 24000,
+    "output_audio_format": "pcm16",
+    "output_audio_sample_rate": 24000,
+    "input_audio_transcription": {"model": "universal-streaming"},
+    "output_modalities": ["audio", "text"],
+    "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+    "tools": [],
+    "tool_choice": "auto"
+  }
+}
+
+### Tool definition schema:
+{
+  "type": "function",
+  "name": "tool_name",
+  "description": "What this tool does",
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "param_name": {"type": "string", "description": "Param description"}
+    },
+    "required": ["param_name"]
+  }
+}
+
+### Tool calling pattern:
+- On "response.function_call_arguments.done": Start executing the function immediately (use asyncio.create_task)
+- On "response.done": Send results back via "conversation.item.create" with type "function_call_output"
+- Do NOT send "response.create" after tool results — the server continues automatically
+- Interruptions are handled automatically by server-side VAD — no client logic needed
+
+### Key events:
+Client sends: session.update, input_audio_buffer.append (base64 audio)
+Server sends: session.created, input_audio_buffer.speech_started, input_audio_buffer.speech_stopped, conversation.item.input_audio_transcription.completed, response.output_audio.delta (base64 audio), response.output_audio_transcript.done, response.function_call_arguments.done, response.done
+
+### Python quickstart template (raw WebSocket with websockets + sounddevice):
+\`\`\`python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+import websockets
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+TOOLS = []  # Add tool definitions here
+
+async def run_tool(name, args):
+    # Implement tool logic here
+    return {"error": f"Unknown tool: {name}"}
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low")
+        self._out.start()
+    def play(self, pcm):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+    def close(self):
+        self._out.stop(); self._out.close()
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+    def mic_cb(data, frames, ti, status, loop=asyncio.get_running_loop()):
+        loop.call_soon_threadsafe(q.put_nowait, bytes(data))  # audio thread -> event loop; asyncio.Queue is not thread-safe
+    mic = sd.RawInputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, callback=mic_cb, latency="low")
+    mic.start()
+
+    ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "SYSTEM_PROMPT_HERE",
+        "voice": "sage",
+        "tools": TOOLS,
+        "tool_choice": "auto",
+    }}))
+    pending_tasks = {}
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"[{t}] You:   {e.get('transcript', '')}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                if pending_tasks and e.get("response", {}).get("status") == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await ws.send(json.dumps({"type": "conversation.item.create", "item": {"type": "function_call_output", "call_id": cid, "output": json.dumps(result)}}))
+                    pending_tasks.clear()
+    print("Listening — start talking.\\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+\`\`\`
+
+### JavaScript quickstart template (raw WebSocket in Node.js):
+\`\`\`javascript
+// Requires: npm install ws
+const WebSocket = require("ws");
+const API_KEY = "YOUR_ASSEMBLYAI_API_KEY";
+const WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime";
+
+const TOOLS = [];  // Add tool definitions here
+
+function runTool(name, args) {
+  // Implement tool logic here
+  return { error: "Unknown tool: " + name };
+}
+
+const ws = new WebSocket(WS_URL, { headers: { Authorization: "Bearer " + API_KEY } });
+const pendingTasks = new Map();
+
+ws.on("open", () => {
+  ws.send(JSON.stringify({ type: "session.update", session: {
+    input_audio_format: "pcm16", input_audio_sample_rate: 24000,
+    output_audio_format: "pcm16", output_audio_sample_rate: 24000,
+    input_audio_transcription: { model: "universal-streaming" },
+    turn_detection: { type: "server_vad", threshold: 0.5, prefix_padding_ms: 300, silence_duration_ms: 200 },
+    output_modalities: ["audio", "text"],
+    instructions: "SYSTEM_PROMPT_HERE",
+    voice: "sage",
+    tools: TOOLS,
+    tool_choice: "auto",
+  }}));
+  // Start streaming mic audio as base64 PCM16 via input_audio_buffer.append
+});
+
+ws.on("message", async (raw) => {
+  const e = JSON.parse(raw);
+  if (e.type === "response.output_audio.delta") {
+    // Play base64-decoded PCM16 audio: Buffer.from(e.delta, "base64")
+  } else if (e.type === "response.output_audio_transcript.done") {
+    console.log("Agent:", e.transcript);
+  } else if (e.type === "response.function_call_arguments.done") {
+    pendingTasks.set(e.call_id, runTool(e.name, JSON.parse(e.arguments)));
+  } else if (e.type === "response.done" && pendingTasks.size > 0) {
+    for (const [callId, resultPromise] of pendingTasks) {
+      const result = await resultPromise;
+      ws.send(JSON.stringify({ type: "conversation.item.create", item: { type: "function_call_output", call_id: callId, output: JSON.stringify(result) } }));
+    }
+    pendingTasks.clear();
+  }
+});
+\`\`\`
+
+Full documentation: ${DOCS_URL}`;
+
+const FORMAT_INSTRUCTIONS: Record<OutputFormat, string> = {
+  python: `Generate a COMPLETE, RUNNABLE Python script using the raw WebSocket template above. Include:
+1. A detailed system prompt in the instructions field tailored to the agent's purpose
+2. All tool definitions in the TOOLS array with proper JSON schemas
+3. Full run_tool() implementation with realistic mock data for each tool
+4. All imports, the AudioPlayer class, mic handling, and event loop — everything needed to pip install and run
+Choose an appropriate voice from: sage, ember, breeze, cascade.`,
+
+  javascript: `Generate a COMPLETE, RUNNABLE JavaScript/Node.js script using the JS WebSocket template above. Include:
+1. A detailed system prompt in the instructions field tailored to the agent's purpose
+2. All tool definitions in the TOOLS array with proper JSON schemas
+3. Full runTool() implementation with realistic mock data for each tool
+4. All requires, WebSocket setup, and event handling — everything needed to npm install and run
+Choose an appropriate voice from: sage, ember, breeze, cascade.
+For audio I/O, use a comment placeholder since Node.js audio libraries vary.`,
+
+  config: `Generate ONLY the session configuration JSON (the session.update payload) with:
+1. A detailed system prompt in the instructions field tailored to the agent's purpose
+2. All tool definitions in the tools array with proper JSON schemas
+3. An appropriate voice chosen from: sage, ember, breeze, cascade
+4. All audio format and turn detection settings filled in
+Output ONLY the JSON — no script wrapper.`,
+};
+
+const MAX_DESCRIPTION_CHARS = 2000;
+const MAX_URL_LENGTH = 8000;
+
+const truncateAtWordBoundary = (text: string, maxLength: number): string => {
+  if (text.length <= maxLength) return text;
+  const hardCut = text.substring(0, maxLength);
+  const cutAt = hardCut.lastIndexOf(" ");
+  // Back up to the last space only when more than 50 characters would remain; else keep the hard cut.
+  return cutAt > 50 ? hardCut.substring(0, cutAt) : hardCut;
+};
+
+export function AgentGenerator() {
+  const [description, setDescription] = React.useState("");
+  const [format, setFormat] = React.useState<OutputFormat>("python");
+
+  const buildPrompt = (maxContentLength?: number) => {
+    const rawText = description || "(No description provided — generate a general-purpose helpful voice assistant)";
+
+    // A missing, zero or negative budget is ignored; a negative value would otherwise
+    // truncate the description to an empty string (String.substring clamps it to 0).
+    const budget = maxContentLength && maxContentLength > 0 ? maxContentLength : MAX_DESCRIPTION_CHARS;
+    const limit = Math.min(MAX_DESCRIPTION_CHARS, budget);
+    // Truncate once against the tighter limit so the marker is appended at most once.
+    const descText =
+      rawText.length > limit ? truncateAtWordBoundary(rawText, limit) + "\n\n[Description truncated]" : rawText;
+
+    return `${LLM_CONTEXT}
+
+## Output format
+${FORMAT_INSTRUCTIONS[format]}
+
+## User's agent description
+${descText}`;
+  };
+
+  const getMaxContentLength = (baseUrl: string) => {
+    // Character budget for the description: what is left of MAX_URL_LENGTH after the fixed prompt.
+    const descText = description || "(No description provided — generate a general-purpose helpful voice assistant)";
+    const promptWithoutDesc = buildPrompt(0).replace(descText, "");
+    const encodedBaseLength = baseUrl.length + encodeURIComponent(promptWithoutDesc).length;
+    const available = MAX_URL_LENGTH - encodedBaseLength;
+    // Never negative: buildPrompt skips a 0 budget but would cut the description to nothing on a negative one.
+    return Math.max(0, Math.floor(available / 3));
+  };
+
+  // Opens Claude in a new tab with the prompt in `q`; noopener/noreferrer withholds window.opener.
+  const openInClaude = () => {
+    const baseUrl = "https://claude.ai/new?q=";
+    const prompt = encodeURIComponent(buildPrompt(getMaxContentLength(baseUrl)));
+    window.open(`${baseUrl}${prompt}`, "_blank", "noopener,noreferrer");
+  };
+
+  // Opens ChatGPT in a new tab with the prompt in `q`; noopener/noreferrer withholds window.opener.
+  const openInChatGPT = () => {
+    const baseUrl = "https://chat.openai.com/?q=";
+    const prompt = encodeURIComponent(buildPrompt(getMaxContentLength(baseUrl)));
+    window.open(`${baseUrl}${prompt}`, "_blank", "noopener,noreferrer");
+  };
+
+  // Opens Google AI Studio in a new tab with the prompt in `prompt`; noopener/noreferrer withholds window.opener.
+  const openInGemini = () => {
+    const baseUrl = "https://aistudio.google.com/prompts/new_chat?prompt=";
+    const prompt = encodeURIComponent(buildPrompt(getMaxContentLength(baseUrl)));
+    window.open(`${baseUrl}${prompt}`, "_blank", "noopener,noreferrer");
+  };
+
+  const containerStyle: React.CSSProperties = {
+    border: "1px solid var(--grayscale-a4, #e5e7eb)",
+    borderRadius: "8px",
+    padding: "24px",
+    backgroundColor: "var(--grayscale-2, #f9fafb)",
+  };
+
+  const labelStyle: React.CSSProperties = {
+    display: "block",
+    fontSize: "14px",
+    fontWeight: 500,
+    marginBottom: "8px",
+    color: "var(--grayscale-12, #111827)",
+  };
+
+  const textareaStyle: React.CSSProperties = {
+    width: "100%",
+    height: "120px",
+    padding: "12px",
+    border: "1px solid var(--grayscale-a4, #d1d5db)",
+    borderRadius: "6px",
+    fontSize: "14px",
+    fontFamily: "inherit",
+    resize: "vertical",
+    backgroundColor: "var(--grayscale-1, #ffffff)",
+    color: "var(--grayscale-12, #111827)",
+  };
+
+  const charCountStyle: React.CSSProperties = {
+    fontSize: "12px",
+    color: "var(--grayscale-11, #6b7280)",
+    marginTop: "4px",
+  };
+
+  const toggleContainerStyle: React.CSSProperties = {
+    display: "flex",
+    gap: "4px",
+    padding: "4px",
+    backgroundColor: "var(--grayscale-a3, #e5e7eb)",
+    borderRadius: "6px",
+    width: "fit-content",
+  };
+
+  const toggleButtonStyle = (active: boolean): React.CSSProperties => ({
+    padding: "6px 16px",
+    border: "none",
+    borderRadius: "4px",
+    fontSize: "13px",
+    fontWeight: 500,
+    cursor: "pointer",
+    backgroundColor: active ? "var(--grayscale-1, #ffffff)" : "transparent",
+    color: active ? "var(--grayscale-12, #111827)" : "var(--grayscale-11, #6b7280)",
+    boxShadow: active ? "0 1px 2px rgba(0,0,0,0.08)" : "none",
+    transition: "all 0.15s ease",
+  });
+
+  const buttonBaseStyle: React.CSSProperties = {
+    display: "inline-flex",
+    alignItems: "center",
+    gap: "8px",
+    padding: "10px 20px",
+    border: "none",
+    borderRadius: "6px",
+    fontSize: "14px",
+    fontWeight: 500,
+    cursor: "pointer",
+    color: "#ffffff",
+  };
+
+  const helpTextStyle: React.CSSProperties = {
+    marginTop: "12px",
+    fontSize: "13px",
+    color: "var(--grayscale-11, #6b7280)",
+  };
+
+  return (
+    <div style={containerStyle}>
+      <div style={{ display: "flex", flexDirection: "column", gap: "16px" }}>
+        <div>
+          <label style={labelStyle}>Describe your agent</label>
+          <textarea
+            value={description}
+            onChange={(e) => setDescription(e.target.value)}
+            placeholder="A customer service agent for a pizza delivery company. It can check order status by order number, estimate delivery time, process refunds, and answer questions about the menu. It should be friendly and concise."
+            style={textareaStyle}
+          />
+          <div style={charCountStyle}>
+            {description.length > 0 &&
+              `${description.length.toLocaleString()} / ${MAX_DESCRIPTION_CHARS.toLocaleString()} characters`}
+            {description.length > MAX_DESCRIPTION_CHARS && " — will be truncated"}
+          </div>
+        </div>
+
+        <div>
+          <label style={labelStyle}>Output format</label>
+          <div style={toggleContainerStyle}>
+            <button
+              onClick={() => setFormat("python")}
+              style={toggleButtonStyle(format === "python")}
+            >
+              Python
+            </button>
+            <button
+              onClick={() => setFormat("javascript")}
+              style={toggleButtonStyle(format === "javascript")}
+            >
+              JavaScript
+            </button>
+            <button
+              onClick={() => setFormat("config")}
+              style={toggleButtonStyle(format === "config")}
+            >
+              Config only
+            </button>
+          </div>
+        </div>
+
+        <div style={{ marginTop: "8px" }}>
+          <label style={labelStyle}>Generate with AI</label>
+          <div style={{ display: "flex", gap: "12px", flexWrap: "wrap" }}>
+            <button
+              onClick={openInClaude}
+              style={{ ...buttonBaseStyle, backgroundColor: "#d97706" }}
+            >
+              <svg width="16" height="16" viewBox="0 0 16 17" fill="none" xmlns="http://www.w3.org/2000/svg">
+                <path fillRule="evenodd" clipRule="evenodd" d="M9.218 2.52954H11.62L16 13.5162H13.598L9.218 2.52954ZM4.37933 2.52954H6.89067L11.2707 13.5162H8.82133L7.926 11.2089H3.34467L2.44867 13.5155H0L4.38 2.53087L4.37933 2.52954ZM7.134 9.16887L5.63533 5.30754L4.13667 9.16954H7.13333L7.134 9.16887Z" fill="currentColor"/>
+              </svg>
+              Open in Claude
+            </button>
+            <button
+              onClick={openInChatGPT}
+              style={{ ...buttonBaseStyle, backgroundColor: "#10a37f" }}
+            >
+              <svg width="16" height="16" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
+                <path d="M22.0606 9.86697C22.6034 8.23781 22.4165 6.45314 21.5485 4.97127C20.2431 2.69837 17.6188 1.52902 15.0558 2.0793C13.9156 0.794818 12.2774 0.0643507 10.5601 0.074818C7.94025 0.0688367 5.61576 1.75557 4.80978 4.24828C3.12679 4.59295 1.67408 5.64641 0.823986 7.13949C-0.491154 9.40641-0.191341 12.264 1.56567 14.2079C1.02286 15.8371 1.20978 17.6217 2.07782 19.1036C3.38324 21.3765 6.00754 22.5458 8.57053 21.9956C9.70997 23.2801 11.3488 24.0105 13.0662 23.9993C15.6875 24.006 18.0128 22.3178 18.8188 19.8229C20.5017 19.4782 21.9545 18.4247 22.8045 16.9316C24.1182 14.6647 23.8176 11.8094 22.0614 9.86547L22.0606 9.86697ZM13.0677 22.4359C12.0188 22.4374 11.0027 22.0703 10.1974 21.3982L10.3388 21.3182L15.1029 18.5668C15.3466 18.4285 15.4961 18.169 15.4946 17.8886V11.1724L17.5081 12.335C17.5298 12.3455 17.544 12.3664 17.547 12.3903V17.9522C17.544 20.4255 15.541 22.4307 13.0677 22.4359ZM3.43483 18.3215C2.90922 17.4139 2.72006 16.35 2.90025 15.3174L3.04156 15.4019L7.80567 18.1533C8.04716 18.2946 8.34623 18.2946 8.58847 18.1533L14.4045 14.7948V17.1201C14.406 17.144 14.3948 17.1672 14.3761 17.1821L9.56044 19.9627C7.41539 21.1978 4.67595 20.4636 3.43558 18.3215H3.43483ZM2.181 7.92229C2.70436 7.01314 3.53053 6.31781 4.51445 5.95669V6.12117L4.51221 11.6247C4.51072 11.9043 4.66025 12.1638 4.90324 12.3021L10.7193 15.6599L8.70586 16.8225C8.68567 16.8359 8.66025 16.8382 8.63782 16.8285L3.82137 14.0457C1.68081 12.806 0.946603 10.0673 2.18025 7.92304L2.181 7.92229ZM18.7238 11.772L12.9077 8.41351L14.9212 7.25164C14.9414 7.23818 14.9668 7.23594 14.9892 7.24566L19.8057 10.0262C21.95 11.2651 22.6849 14.0083 21.446 16.1526C20.9219 17.0602 20.0965 17.7556 19.1133 18.1174V12.4494C19.1156 12.1698 18.9668 11.9111 18.7245 11.772H18.7238ZM20.7275 8.75594L20.5862 8.67145L15.8221 5.92005C15.5806 5.77874 15.2816 5.77874 15.0393 5.92005L9.22324 9.27856V6.95332C9.22174 6.9294 9.23296 6.90622 9.25165 6.89127L14.0674 4.11295C16.2124 2.87557 18.9548 3.61201 20.1915 5.75781C20.7141 
6.66398 20.9032 7.72491 20.726 8.75594H20.7275ZM8.12866 12.9002L6.11445 11.7376C6.09277 11.7272 6.07857 11.7062 6.07558 11.6823V6.12043C6.07707 3.64416 8.08604 1.63743 10.5623 1.63893C11.6098 1.63893 12.6236 2.00678 13.4288 2.67669L13.2875 2.75669L8.52343 5.50809C8.27969 5.64641 8.13016 5.9051 8.13165 6.18547L8.12866 12.8987V12.9002ZM9.22249 10.5421L11.8131 9.04603L14.4038 10.5414V13.5328L11.8131 15.0281L9.22249 13.5328V10.5421Z" fill="currentColor"/>
+              </svg>
+              Open in ChatGPT
+            </button>
+            <button
+              onClick={openInGemini}
+              style={{ ...buttonBaseStyle, backgroundColor: "#4285f4" }}
+            >
+              <svg width="16" height="16" viewBox="0 0 16 17" fill="none" xmlns="http://www.w3.org/2000/svg">
+                <path d="M8 16.5C8 12.634 11.134 9.5 15 9.5V8.5C11.134 8.5 8 5.366 8 1.5C8 5.366 4.866 8.5 1 8.5V9.5C4.866 9.5 8 12.634 8 16.5Z" fill="currentColor"/>
+              </svg>
+              Open in Gemini
+            </button>
+          </div>
+          <p style={helpTextStyle}>
+            Opens your preferred AI with your agent description and the full S2S
+            API reference pre-loaded. It will generate a complete agent with
+            system prompt, tool definitions, and runnable code.
+          </p>
+        </div>
+      </div>
+    </div>
+  );
+}

From 6113a695847ba3ce8d7d24cc70b3909fd6f2ba51 Mon Sep 17 00:00:00 2001
From: dan-ince-aai <dince@assemblyai.com>
Date: Wed, 18 Feb 2026 19:17:47 +0000
Subject: [PATCH 3/7] Fix S2S docs path to match main's directory structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Main renamed 02-speech-to-text/ to speech-to-text/ — copy new docs
content to the correct path that docs.yml references.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../voice-agents/speechtospeech.mdx           | 1014 +++++++++++------
 1 file changed, 664 insertions(+), 350 deletions(-)

diff --git a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
index c3c28c660..595d9fd7b 100644
--- a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
+++ b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -1,467 +1,781 @@
 ---
 title: "Speech-to-Speech"
-description: "Build real-time voice AI agents that listen and respond naturally"
+description: "Build real-time voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back."
 ---
 
-Build voice-powered AI agents that have natural conversations with your users. Your agent listens to speech and responds with a natural-sounding voice—all in real-time.
+import { AgentGenerator } from "../../../../assets/components/AgentGenerator";
 
-<Note>
-  This is an early stage product subject to change and should not be used for
-  production usage.
-</Note>
+Build voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back — with built-in transcription, turn detection, and function calling. The API is compatible with the [OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime), so you can use the OpenAI SDK or any OpenAI-compatible framework like LiveKit.
 
-## How it works
-
-```
-┌─────────────┐                 ┌─────────────────┐                 ┌─────────────┐
-│             │     Audio       │                 │      Audio      │             │
-│    User     │  ────────────►  │   Voice Agent   │  ────────────►  │    User     │
-│  (speaks)   │                 │                 │                 │   (hears)   │
-└─────────────┘                 └─────────────────┘                 └─────────────┘
-```
-
-1. **User speaks** — Your app captures microphone audio and streams it to the agent
-2. **Agent responds** — The agent processes the speech and generates a spoken response
-3. **User hears** — Your app receives audio and plays it through the speaker
-
-The entire flow happens in real-time with low latency.
-
----
-
-## Quick Start
+## Quickstart
 
-Get a voice agent up and running in 3 steps.
+Install dependencies and talk to your agent in under a minute.
 
-### Step 1: Get your API key
-
-Grab your API key from your [AssemblyAI dashboard](https://www.assemblyai.com/app).
-
-### Step 2: Create your agent
-
-Create an agent by sending a POST request. Here's an example of a friendly assistant:
-
-<Tabs>
-<Tab title="cURL">
 ```bash
-curl -X POST https://aaigentsv1.up.railway.app/agents \
-  -H "Authorization: YOUR_API_KEY" \
-  -H "Content-Type: application/json" \
-  -d '{
-    "agent_name": "friendly_assistant",
-    "instructions": "You are a friendly and helpful assistant. Keep your responses concise and conversational. Be warm and personable.",
-    "voice": "luna",
-    "greeting": "Say hello and ask how you can help today."
-  }'
-```
-</Tab>
-<Tab title="Python">
-```python
-import requests
-
-response = requests.post(
-"https://aaigentsv1.up.railway.app/agents",
-headers={
-"Authorization": "YOUR_API_KEY",
-"Content-Type": "application/json"
-},
-json={
-"agent_name": "friendly_assistant",
-"instructions": "You are a friendly and helpful assistant. Keep your responses concise and conversational. Be warm and personable.",
-"voice": "luna",
-"greeting": "Say hello and ask how you can help today."
-}
-)
-
-print(response.json())
-
-````
-</Tab>
-<Tab title="JavaScript">
-```javascript
-const response = await fetch("https://aaigentsv1.up.railway.app/agents", {
-  method: "POST",
-  headers: {
-    "Authorization": "YOUR_API_KEY",
-    "Content-Type": "application/json"
-  },
-  body: JSON.stringify({
-    agent_name: "friendly_assistant",
-    instructions: "You are a friendly and helpful assistant. Keep your responses concise and conversational. Be warm and personable.",
-    voice: "luna",
-    greeting: "Say hello and ask how you can help today."
-  })
-});
-
-console.log(await response.json());
-````
-
-</Tab>
-</Tabs>
-
-### Step 3: Start a conversation
-
-Connect to your agent via WebSocket and start talking:
-
+pip install websockets sounddevice
 ```
-wss://aaigentsv1.up.railway.app/ws/friendly_assistant
-```
-
-Once connected, send audio as binary WebSocket frames (PCM16, 16kHz, mono) and receive the agent's spoken responses back as audio.
 
-<Accordion title="Full Python example">
 ```python
-import asyncio
-import json
-import websockets
+import asyncio, base64, json, threading
 import sounddevice as sd
-import numpy as np
-
-async def voice_chat():
-uri = "wss://aaigentsv1.up.railway.app/ws/friendly_assistant"
-queue = asyncio.Queue(maxsize=100)
-session_ready = False
-
-    async with websockets.connect(uri, ping_interval=10, ping_timeout=20) as ws:
-        print("Connected! Waiting for session...")
-
-        # Send microphone audio to the agent
-        async def send_audio():
-            while True:
-                data = await queue.get()
-                if session_ready:
-                    await ws.send(data)
-                queue.task_done()
-
-        asyncio.create_task(send_audio())
-        loop = asyncio.get_running_loop()
-
-        def mic_callback(indata, frames, time, status):
-            if not queue.full():
-                loop.call_soon_threadsafe(queue.put_nowait, bytes(indata))
+import websockets
 
-        with sd.InputStream(samplerate=16000, channels=1, dtype='int16', callback=mic_callback), \
-             sd.OutputStream(samplerate=16000, channels=1, dtype='int16') as speaker:
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+
+class AudioPlayer:
+    """Buffers and plays PCM16 audio in real time."""
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status, loop=asyncio.get_running_loop()):
+        loop.call_soon_threadsafe(q.put_nowait, bytes(data))  # runs on the audio thread; asyncio.Queue is not thread-safe
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+    }}))
+
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"Agent: {e.get('transcript', '')}")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"You:   {e.get('transcript', '')}")
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 
-            while True:
-                response = await ws.recv()
+Replace `YOUR_ASSEMBLYAI_API_KEY` with your key from the [AssemblyAI dashboard](https://www.assemblyai.com/dashboard/signup), run the script, and start talking.
 
-                # Play audio responses
-                if isinstance(response, bytes) and len(response):
-                    speaker.write(np.frombuffer(response, dtype=np.int16))
+---
 
-                # Handle JSON messages
-                elif isinstance(response, str):
-                    msg = json.loads(response)
+## How it works
 
-                    if msg.get("type") == "session.created":
-                        print("Session ready! Start speaking...")
-                        session_ready = True
+```
+Client                                     Server
+  |                                           |
+  |--- WebSocket connect -------------------->|
+  |--- session.update (config) -------------->|
+  |--- input_audio_buffer.append ------------>|  stream mic audio
+  |                                           |
+  |<------------ session.created -------------|
+  |<------------ speech_started --------------|  user is talking
+  |<------------ speech_stopped --------------|  user finished
+  |<------------ transcription.completed -----|  what the user said
+  |<------------ response.output_audio.delta -|  agent speaks back
+  |<------------ response.done ---------------|
+  |                                           |
+```
 
-                    elif msg.get("type") == "conversation.item.done":
-                        item = msg.get("item", {})
-                        role = item.get("role")
-                        text = item.get("content", [{}])[0].get("text", "")
-                        print(f"[{role}]: {text}")
+1. **Connect** — Open a WebSocket to `wss://speech-to-speech.assemblyai.com/v1/realtime` with your API key in the `Authorization: Bearer` header.
+2. **Configure** — Send a `session.update` with your voice, instructions, turn detection settings, and any tools.
+3. **Stream audio** — Send base64-encoded PCM16 audio chunks. The server detects when the user starts and stops speaking.
+4. **Receive responses** — The server transcribes the user's speech, generates a response, and streams back audio and text in real time.
 
-asyncio.run(voice_chat())
+The API is fully compatible with the OpenAI Realtime protocol, so the [OpenAI Python SDK](https://github.com/openai/openai-python), [LiveKit Agents](https://docs.livekit.io/agents/), and any OpenAI-compatible client work out of the box — just point them at `wss://speech-to-speech.assemblyai.com/v1`.
 
-````
+---
 
-Install dependencies with:
-```bash
-pip install websockets sounddevice numpy
-````
+## Agent generator
 
-</Accordion>
+Describe your agent and we'll generate the complete code — system prompt, tool definitions, and a runnable script.
 
-That's it! You now have a working voice agent.
+<AgentGenerator />
 
 ---
 
-## Example agents
+## Configuration
 
-Here are some practical examples to inspire your own agents.
+Configure your session by sending a `session.update` event after connecting. The API accepts two session formats depending on your integration approach.
 
-### Customer support agent
+### Flat format (Raw WebSocket)
 
 ```json
 {
-  "agent_name": "support_agent",
-  "instructions": "You are a customer support agent for a software company. Be helpful, patient, and empathetic. Ask clarifying questions to understand the customer's issue. If you can't solve a problem, offer to escalate to a human agent. Keep responses brief and focused.",
-  "voice": "celeste",
-  "greeting": "Thank the customer for calling and ask how you can help them today."
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "voice": "sage",
+    "input_audio_format": "pcm16",
+    "input_audio_sample_rate": 24000,
+    "output_audio_format": "pcm16",
+    "output_audio_sample_rate": 24000,
+    "input_audio_transcription": {"model": "universal-streaming"},
+    "output_modalities": ["audio", "text"],
+    "turn_detection": {
+      "type": "server_vad",
+      "threshold": 0.5,
+      "prefix_padding_ms": 300,
+      "silence_duration_ms": 200,
+      "create_response": true
+    },
+    "tools": [],
+    "tool_choice": "auto"
+  }
 }
 ```
 
-### Appointment scheduler
+### Nested format (OpenAI SDK / LiveKit)
+
+The OpenAI GA SDK and LiveKit plugin use a nested session format.
 
 ```json
 {
-  "agent_name": "appointment_scheduler",
-  "instructions": "You are a friendly receptionist who helps schedule appointments. Collect the caller's name, preferred date and time, and reason for the appointment. Confirm all details before ending the call. Be efficient but warm.",
-  "voice": "estelle",
-  "greeting": "Welcome the caller and ask if they'd like to schedule an appointment."
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "output_modalities": ["audio", "text"],
+    "audio": {
+      "input": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "transcription": {"model": "universal-streaming"},
+        "turn_detection": {
+          "type": "server_vad",
+          "threshold": 0.5,
+          "prefix_padding_ms": 300,
+          "silence_duration_ms": 200,
+          "create_response": true
+        }
+      },
+      "output": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "voice": "sage"
+      }
+    },
+    "tools": [],
+    "tool_choice": "auto"
+  }
 }
 ```
 
-### Virtual concierge
+### Session parameters
 
-```json
-{
-  "agent_name": "hotel_concierge",
-  "instructions": "You are a luxury hotel concierge. Be warm, professional, and knowledgeable. Help guests with restaurant recommendations, local attractions, transportation, and any requests. Anticipate needs and offer personalized suggestions.",
-  "voice": "orion",
-  "greeting": "Welcome the guest and ask how you can make their stay more enjoyable."
-}
-```
+<ParamField path="instructions" type="string">
+  System prompt for the AI agent. Defines personality, behavior, and constraints.
+</ParamField>
 
----
+<ParamField path="voice" type="string" default="sage">
+  Voice for agent audio responses. One of: `sage`, `ember`, `breeze`, `cascade`.
+</ParamField>
 
-## Choose a voice
+<ParamField path="input_audio_format" type="string" default="pcm16">
+  Input audio encoding. Use `pcm16` (signed 16-bit little-endian).
+</ParamField>
 
-Pick a voice that matches your agent's personality.
+<ParamField path="input_audio_sample_rate" type="integer" default="24000">
+  Input audio sample rate in Hz.
+</ParamField>
 
-| Voice       | Style                               |
-| ----------- | ----------------------------------- |
-| `luna`      | Chill but excitable, gen-z optimist |
-| `celeste`   | Warm, laid-back, fun-loving         |
-| `orion`     | Older male, warm and happy          |
-| `ursa`      | Young male, energetic               |
-| `astra`     | Young female, wide-eyed and curious |
-| `esther`    | Older female, loving and caring     |
-| `estelle`   | Middle-aged female, sweet and kind  |
-| `andromeda` | Young female, breathy and calm      |
+<ParamField path="output_audio_format" type="string" default="pcm16">
+  Output audio encoding. Use `pcm16` (signed 16-bit little-endian).
+</ParamField>
 
----
+<ParamField path="output_audio_sample_rate" type="integer" default="24000">
+  Output audio sample rate in Hz.
+</ParamField>
 
-## Add tools
+<ParamField path="output_modalities" type="array">
+  What the agent returns. Include `"audio"` for spoken responses and `"text"` for transcripts.
+</ParamField>
 
-Tools let your agent take actions—like checking a database, calling an API, or triggering a workflow.
+<ParamField path="input_audio_transcription" type="object">
+  Enables real-time transcription of user speech. Set `model` to `"universal-streaming"`.
+</ParamField>
 
-Here's a simple example of an agent with a weather tool:
+<ParamField path="turn_detection" type="object">
+  Server-side voice activity detection. See [Turn detection](#turn-detection).
+</ParamField>
 
-```json
-{
-  "agent_name": "weather_assistant",
-  "instructions": "You help users check the weather. When they ask about weather, use the get_weather tool to look it up.",
-  "voice": "luna",
-  "tools": [
-    {
-      "name": "get_weather",
-      "description": "Get the current weather for a city",
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "city": {
-            "type": "string",
-            "description": "The city name"
-          }
-        },
-        "required": ["city"]
-      }
-    }
-  ]
-}
-```
+<ParamField path="tools" type="array" default="[]">
+  Functions the agent can call. See [Tool calling](#tool-calling).
+</ParamField>
 
-When a user asks "What's the weather in Tokyo?", the agent sends your client a `tool.call` event:
+<ParamField path="tool_choice" type="string" default="auto">
+  When to use tools. `"auto"` lets the model decide.
+</ParamField>
 
-```json
-{
-  "type": "tool.call",
-  "call_id": "call_abc123",
-  "name": "get_weather",
-  "arguments": { "city": "Tokyo" }
-}
-```
+### Audio format
+
+All audio is **PCM16** (signed 16-bit integer, little-endian), **mono**, **24,000 Hz**. Audio is base64-encoded inside JSON messages. Each chunk should be approximately 20 ms (480 samples, 960 bytes).
+
+### Voices
+
+| Voice | ID |
+|-------|----|
+| Sage | `sage` |
+| Ember | `ember` |
+| Breeze | `breeze` |
+| Cascade | `cascade` |
 
-Your client executes the function and sends back the result:
+### Turn detection
+
+The server automatically detects when the user starts and stops speaking using voice activity detection (VAD). When the user finishes a turn, the agent responds automatically.
 
 ```json
-{
-  "type": "tool.result",
-  "call_id": "call_abc123",
-  "result": "{\"temperature\": \"72°F\", \"conditions\": \"sunny\"}"
+"turn_detection": {
+  "type": "server_vad",
+  "threshold": 0.5,
+  "prefix_padding_ms": 300,
+  "silence_duration_ms": 200,
+  "create_response": true
 }
 ```
 
-The agent then speaks the weather information to the user.
+<ParamField path="type" type="string" required>
+  Set to `"server_vad"` for server-side voice activity detection.
+</ParamField>
 
----
+<ParamField path="threshold" type="float" default="0.5">
+  Speech detection sensitivity (0.0 to 1.0). Lower values detect quieter speech.
+</ParamField>
 
-## Agent configuration
+<ParamField path="prefix_padding_ms" type="integer" default="300">
+  Audio to preserve before speech onset, in milliseconds. Prevents clipping the start of a sentence.
+</ParamField>
 
-Full list of options when creating an agent.
+<ParamField path="silence_duration_ms" type="integer" default="200">
+  How long the user must pause before the server considers them done speaking, in milliseconds.
+</ParamField>
 
-| Field                   | Type   | Default  | Description                                         |
-| ----------------------- | ------ | -------- | --------------------------------------------------- |
-| `agent_name`            | string | required | Unique identifier (letters, numbers, underscores)   |
-| `instructions`          | string | -        | Personality and behavior guidelines                 |
-| `voice`                 | string | `"luna"` | Voice to use for responses                          |
-| `greeting`              | string | -        | What the agent says when a conversation starts      |
-| `temperature`           | float  | `0.8`    | Response creativity (0.0 = focused, 1.0 = creative) |
-| `max_tokens`            | int    | `4096`   | Maximum response length                             |
-| `language`              | string | `"en"`   | Language code                                       |
-| `tools`                 | array  | -        | Tool definitions (see above)                        |
-| `audio_in_sample_rate`  | int    | `16000`  | Input audio sample rate in Hz                       |
-| `audio_out_sample_rate` | int    | `16000`  | Output audio sample rate in Hz                      |
+<ParamField path="create_response" type="boolean" default="true">
+  Automatically generate an agent response when the user finishes speaking.
+</ParamField>
 
 ---
 
-## WebSocket events
+## Tool calling
 
-When connected to an agent, you'll receive these events:
+Give your agent the ability to call functions in your application — look up data, take actions, or call external APIs — then continue the conversation with the result.
 
-### session.created
-
-Sent when the connection is established and ready.
+### Define tools in your session config
 
 ```json
-{
-  "type": "session.created",
-  "session": {
-    "id": "uuid",
-    "agent_name": "my_agent"
+"tools": [{
+  "type": "function",
+  "name": "get_weather",
+  "description": "Get the current weather for a location",
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "location": {"type": "string", "description": "City name"}
+    },
+    "required": ["location"]
   }
-}
+}],
+"tool_choice": "auto"
 ```
 
-### conversation.item.done
+### Handle tool calls
 
-Sent when a speaker finishes talking. Contains the transcript.
+When the agent decides to call a function, the server sends `response.function_call_arguments.done` while the response is still in progress. Start executing the function as soon as that event arrives — you don't need to wait for `response.done` before running it. Once `response.done` arrives, send the result back as a `function_call_output` conversation item.
 
-```json
-{
-  "type": "conversation.item.done",
-  "item": {
-    "role": "user",
-    "content": [{ "type": "text", "text": "What's the weather like?" }]
-  }
-}
+```python
+pending_tasks = {}
+
+async for raw in ws:
+    e = json.loads(raw)
+    et = e.get("type", "")
+
+    if et == "response.function_call_arguments.done":
+        # Start executing immediately — don't wait for response.done
+        args = json.loads(e["arguments"])
+        pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+
+    elif et == "response.done" and pending_tasks:
+        # Response is complete — send back the results
+        for call_id, task in pending_tasks.items():
+            result = await task
+            await ws.send(json.dumps({
+                "type": "conversation.item.create",
+                "item": {
+                    "type": "function_call_output",
+                    "call_id": call_id,
+                    "output": json.dumps(result),
+                },
+            }))
+        pending_tasks.clear()
+
+    elif et == "response.output_audio.delta":
+        player.play(base64.b64decode(e["delta"]))
 ```
 
-### conversation.item.interim
+The pattern is: **receive the call** → **start executing immediately** → **send the result when `response.done` arrives**. Your function runs concurrently while the response completes, so there's no wasted time.
 
-Sent during speech with partial transcripts. Useful for showing real-time captions.
+---
 
-```json
-{
-  "type": "conversation.item.interim",
-  "item": {
-    "role": "user",
-    "content": [{ "type": "text", "text": "What's the wea..." }]
-  }
-}
-```
+## Events reference
+
+### Client → Server
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.update` | Configure the session | `session`: configuration object |
+| `input_audio_buffer.append` | Stream an audio chunk | `audio`: base64-encoded PCM16 |
+| `input_audio_buffer.commit` | Commit buffered audio as a user turn | — |
+| `input_audio_buffer.clear` | Discard buffered audio | — |
+| `conversation.item.create` | Add a message or tool result | `item`: conversation item |
+| `conversation.item.delete` | Remove a conversation item | `item_id`: ID to remove |
+| `response.create` | Trigger the agent to respond | — |
+
+### Server → Client
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.created` | Session initialized | `session.id` |
+| `input_audio_buffer.speech_started` | User started speaking | `audio_start_ms` |
+| `input_audio_buffer.speech_stopped` | User stopped speaking | `audio_end_ms` |
+| `input_audio_buffer.committed` | Audio committed as a turn | — |
+| `conversation.item.created` | New conversation item added | `item` |
+| `conversation.item.input_audio_transcription.completed` | User speech transcribed | `transcript` |
+| `response.created` | Agent started generating a response | — |
+| `response.output_audio.delta` | Agent audio chunk | `delta`: base64 PCM16 |
+| `response.output_audio.done` | Agent audio complete | — |
+| `response.output_audio_transcript.delta` | Agent text (streaming) | `delta` |
+| `response.output_audio_transcript.done` | Agent text (final) | `transcript` |
+| `response.function_call_arguments.done` | Agent requesting a tool call | `call_id`, `name`, `arguments` |
+| `response.done` | Response complete | `response.status`: `completed` or `cancelled` |
+| `error` | Error occurred | `error.message` |
 
-### tool.call
+---
 
-Sent when the agent wants to use a tool. See [Add tools](#add-tools) for handling.
+## Complete examples
 
-### Audio (binary)
+Production-ready examples for three integration approaches. Each handles microphone input, speaker output, turn detection, transcription, and tool calling.
 
-The agent's spoken responses come as binary WebSocket frames containing PCM16 audio.
+### Raw WebSocket
 
----
+Direct WebSocket control using the `websockets` library.
 
-## Audio format
+```bash
+pip install websockets sounddevice
+```
 
-Both input and output audio use the same format:
+```python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+import websockets
 
-- **Encoding**: PCM16 (16-bit signed integer, little-endian)
-- **Sample rate**: 16,000 Hz (configurable)
-- **Channels**: Mono
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
 
----
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
+
+
+async def run_tool(name, args):
+    """Replace with your own tool implementations."""
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    try:
+        ws = await websockets.connect(WS_URL, extra_headers={"Authorization": f"Bearer {API_KEY}"})
+    except TypeError:
+        ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+        "tools": TOOLS,
+        "tool_choice": "auto",
+    }}))
+
+    pending_tasks = {}
+
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You started speaking")
+            elif et == "input_audio_buffer.speech_stopped":
+                print(f"[{t}] You stopped speaking")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"[{t}] You:   {e.get('transcript', '')}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                s = e.get("response", {}).get("status", "?")
+                print(f"[{t}] Done ({s})")
+                if pending_tasks and s == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await ws.send(json.dumps({
+                            "type": "conversation.item.create",
+                            "item": {"type": "function_call_output", "call_id": cid, "output": json.dumps(result)},
+                        }))
+                    pending_tasks.clear()
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 
-## REST API reference
+### OpenAI Python SDK
 
-<Accordion title="Manage agents">
+Uses the OpenAI GA Realtime API. Note the differences from the beta API: `websocket_base_url` instead of `base_url`, `client.realtime.connect()` instead of `client.beta.realtime.connect()`, and the nested session format.
 
-**Base URL**: `https://aaigentsv1.up.railway.app`
+```bash
+pip install openai sounddevice
+```
 
-All REST endpoints require an `Authorization: YOUR_API_KEY` header.
+```python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+from openai import AsyncOpenAI
 
-### Create or update agent
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+SAMPLE_RATE = 24000
 
-`POST /agents` — Create a new agent or update an existing one.
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
 
-### List agents
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
+
+
+async def run_tool(name, args):
+    """Replace with your own tool implementations."""
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    connection = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+
+    print("Listening — start talking.\n")
+
+    async def send_config():
+        await connection.session.update(session={
+            "instructions": "You are a helpful voice assistant. Keep responses brief.",
+            "output_modalities": ["audio", "text"],
+            "audio": {
+                "input": {
+                    "format": {"type": "audio/pcm", "rate": 24000},
+                    "transcription": {"model": "universal-streaming"},
+                    "turn_detection": {
+                        "type": "server_vad", "threshold": 0.5,
+                        "prefix_padding_ms": 300, "silence_duration_ms": 200,
+                    },
+                },
+                "output": {
+                    "format": {"type": "audio/pcm", "rate": 24000},
+                    "voice": "sage",
+                },
+            },
+            "tools": TOOLS,
+            "tool_choice": "auto",
+        })
+
+    async def stream_mic():
+        while True:
+            pcm = await q.get()
+            await connection.input_audio_buffer.append(audio=base64.b64encode(pcm).decode())
+
+    async def handle_events():
+        pending_tasks = {}
+        while True:
+            data = await connection.recv_bytes()
+            e = json.loads(data.decode("utf-8"))
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You started speaking")
+            elif et == "input_audio_buffer.speech_stopped":
+                print(f"[{t}] You stopped speaking")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                txt = e.get("transcript", "")
+                if txt:
+                    print(f"[{t}] You:   {txt}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                s = e.get("response", {}).get("status", "?")
+                print(f"[{t}] Done ({s})")
+                if pending_tasks and s == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await connection.conversation.item.create(item={
+                            "type": "function_call_output", "call_id": cid,
+                            "output": json.dumps(result)})
+                    pending_tasks.clear()
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
+
+    try:
+        await asyncio.gather(send_config(), stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close()
+        await connection.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 
-`GET /agents` — List all your agents.
+### LiveKit Agents
 
-```json
-{
-  "agents": ["agent1", "agent2"],
-  "count": 2
-}
+Uses the [LiveKit Agents framework](https://docs.livekit.io/agents/) with the OpenAI Realtime plugin. LiveKit handles audio transport, room management, and client connections — you define the agent behavior.
+
+```bash
+pip install "livekit-agents[openai,silero]" python-dotenv
 ```
 
-### Get agent
+```python
+import asyncio, os
+from dotenv import load_dotenv
+from livekit.agents import Agent, AgentServer, AgentSession, JobContext, JobProcess, RunContext, cli, function_tool
+from livekit.plugins import openai, silero
+from openai.types.beta.realtime.session import TurnDetection
+from openai.types.realtime import AudioTranscription
 
-`GET /agents/{agent_name}` — Get an agent's configuration.
+load_dotenv()
 
-### Delete agent
 
-`DELETE /agents/{agent_name}` — Delete an agent.
+class VoiceAgent(Agent):
+    def __init__(self):
+        super().__init__(instructions="You are a helpful voice assistant. Keep responses brief.")
 
-</Accordion>
+    @function_tool
+    async def get_weather(self, context: RunContext, location: str):
+        """Get the current weather for a location.
 
-<Accordion title="Conversation history">
+        Args:
+            location: City name
+        """
+        return f"72 degrees and sunny in {location}."
 
-### List conversations
 
-`GET /agents/{agent_name}/conversations` — List all conversations for an agent.
+server = AgentServer()
 
-```json
-{
-  "agent_name": "my_agent",
-  "conversations": [
-    {
-      "conversation_id": "uuid",
-      "created_at": "2025-12-18T13:00:00Z"
-    }
-  ],
-  "count": 1
-}
-```
 
-### Get conversation
+def prewarm(proc: JobProcess):
+    proc.userdata["vad"] = silero.VAD.load()
 
-`GET /agents/{agent_name}/conversations/{conversation_id}` — Get a specific conversation with all messages.
 
-```json
-{
-  "conversation_id": "uuid",
-  "agent_name": "my_agent",
-  "items": [],
-  "created_at": "2025-12-18T13:00:00Z"
-}
-```
+server.setup_fnc = prewarm
 
-</Accordion>
 
-<Accordion title="Tool definition schema">
+@server.rtc_session()
+async def entrypoint(ctx: JobContext):
+    session = AgentSession(
+        llm=openai.realtime.RealtimeModel(
+            base_url="wss://speech-to-speech.assemblyai.com/v1",
+            api_key=os.environ["ASSEMBLYAI_API_KEY"],
+            model="universal-streaming",
+            voice="sage",
+            input_audio_transcription=AudioTranscription(model="universal-streaming"),
+            turn_detection=TurnDetection(
+                type="server_vad",
+                threshold=0.5,
+                prefix_padding_ms=300,
+                silence_duration_ms=200,
+                create_response=True,
+            ),
+        )
+    )
+    await session.start(agent=VoiceAgent(), room=ctx.room)
+    await ctx.connect()
 
-Tools follow JSON Schema format:
 
-```json
-{
-  "name": "tool_name",
-  "description": "What this tool does",
-  "parameters": {
-    "type": "object",
-    "properties": {
-      "param_name": {
-        "type": "string",
-        "description": "What this parameter is for"
-      }
-    },
-    "required": ["param_name"]
-  }
-}
+if __name__ == "__main__":
+    cli.run_app(server)
 ```
 
-**Supported parameter types**: `string`, `number`, `boolean`, `array`, `object`
+Run with:
 
-</Accordion>
+```bash
+python agent.py console
+```

From dd7a67add2ae771cd5fb1c2465437ef0164abba9 Mon Sep 17 00:00:00 2001
From: dan-ince-aai <dince@assemblyai.com>
Date: Tue, 24 Feb 2026 12:49:44 +0000
Subject: [PATCH 4/7] fix quickstart

---
 .../voice-agents/speechtospeech.mdx           | 565 ++++++++++++------
 1 file changed, 378 insertions(+), 187 deletions(-)

diff --git a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
index 595d9fd7b..630db1833 100644
--- a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
+++ b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -12,79 +12,121 @@ Build voice agents with a single WebSocket connection. Stream audio in, get inte
 Install dependencies and talk to your agent in under a minute.
 
 ```bash
-pip install websockets sounddevice
+pip install openai sounddevice
 ```
 
 ```python
-import asyncio, base64, json, threading
+import asyncio, base64, json, queue, threading
 import sounddevice as sd
-import websockets
+from openai import AsyncOpenAI
 
 API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
-WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
 SAMPLE_RATE = 24000
 
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
 
 class AudioPlayer:
-    """Buffers and plays PCM16 audio in real time."""
+    """Callback-based player — never blocks the event loop."""
     def __init__(self):
         self._buf = bytearray()
         self._lock = threading.Lock()
         self._out = sd.RawOutputStream(
-            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback,
         )
         self._out.start()
 
+    def _callback(self, outdata, frames, time_info, status):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n])
+            del self._buf[:n]
+        if len(chunk) < n:
+            chunk += b'\x00' * (n - len(chunk))
+        outdata[:] = chunk
+
     def play(self, pcm: bytes):
         with self._lock:
             self._buf.extend(pcm)
-            while len(self._buf) >= 960:
-                self._out.write(bytes(self._buf[:960]))
-                del self._buf[:960]
+
+    def clear(self):
+        with self._lock:
+            self._buf.clear()
 
     def close(self):
-        self._out.stop()
-        self._out.close()
+        self._out.stop(); self._out.close()
 
 
 async def main():
     player = AudioPlayer()
-    q = asyncio.Queue()
+    q = queue.Queue()
 
     def mic_cb(data, frames, ti, status):
         q.put_nowait(bytes(data))
 
     mic = sd.RawInputStream(
-        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
-        callback=mic_cb, latency="low",
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+        blocksize=480, callback=mic_cb, latency="low",
     )
     mic.start()
 
-    ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+    connection = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
 
-    await ws.send(json.dumps({"type": "session.update", "session": {
-        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
-        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
-        "input_audio_transcription": {"model": "universal-streaming"},
-        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
-        "output_modalities": ["audio", "text"],
+    # Send config immediately after connecting
+    await connection.session.update(session={
+        "type": "realtime",
         "instructions": "You are a helpful voice assistant. Keep responses brief.",
-        "voice": "sage",
-    }}))
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {
+                "format": {"type": "audio/pcm", "rate": SAMPLE_RATE},
+                "transcription": {"model": "universal-streaming"},
+                "turn_detection": {
+                    "type": "server_vad", "threshold": 0.5,
+                    "prefix_padding_ms": 300, "silence_duration_ms": 200,
+                },
+            },
+            "output": {
+                "format": {"type": "audio/pcm", "rate": SAMPLE_RATE},
+                "voice": "sage",
+            },
+        },
+    })
+
+    raw_ws = connection._connection
+    loop = asyncio.get_running_loop()
+
+    # Drain any mic audio that accumulated during connection setup
+    while not q.empty():
+        q.get_nowait()
 
     async def stream_mic():
         while True:
-            try:
-                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
-                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
-            except asyncio.TimeoutError:
-                pass
+            pcm = await loop.run_in_executor(None, q.get)
+            await raw_ws.send(json.dumps({
+                "type": "input_audio_buffer.append",
+                "audio": base64.b64encode(pcm).decode(),
+            }))
 
     async def handle_events():
-        async for raw in ws:
-            e = json.loads(raw)
+        while True:
+            data = await connection.recv_bytes()
+            e = json.loads(data.decode("utf-8"))
             et = e.get("type", "")
-            if et == "response.output_audio.delta":
+
+            if et == "session.created":
+                print(f"Connected — session {e['session']['id']}")
+                await connection.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                player.clear()  # stop agent audio when user interrupts
+            elif et == "response.output_audio.delta":
                 player.play(base64.b64decode(e["delta"]))
             elif et == "response.output_audio_transcript.done":
                 print(f"Agent: {e.get('transcript', '')}")
@@ -97,7 +139,8 @@ async def main():
     except KeyboardInterrupt:
         pass
     finally:
-        mic.stop(); mic.close(); player.close(); await ws.close()
+        mic.stop(); mic.close(); player.close()
+        await connection.close()
 
 
 if __name__ == "__main__":
@@ -114,10 +157,10 @@ Replace `YOUR_ASSEMBLYAI_API_KEY` with your key from the [AssemblyAI dashboard](
 Client                                     Server
   |                                           |
   |--- WebSocket connect -------------------->|
-  |--- session.update (config) -------------->|
-  |--- input_audio_buffer.append ------------>|  stream mic audio
+  |--- session.update (config) -------------->|  send immediately after connect
+  |--- input_audio_buffer.append ------------>|  stream mic audio (start right away)
   |                                           |
-  |<------------ session.created -------------|
+  |<------------ session.created -------------|  clear buffer here
   |<------------ speech_started --------------|  user is talking
   |<------------ speech_stopped --------------|  user finished
   |<------------ transcription.completed -----|  what the user said
@@ -127,9 +170,10 @@ Client                                     Server
 ```
 
 1. **Connect** — Open a WebSocket to `wss://speech-to-speech.assemblyai.com/v1/realtime` with your API key in the `Authorization: Bearer` header.
-2. **Configure** — Send a `session.update` with your voice, instructions, turn detection settings, and any tools.
-3. **Stream audio** — Send base64-encoded PCM16 audio chunks. The server detects when the user starts and stops speaking.
-4. **Receive responses** — The server transcribes the user's speech, generates a response, and streams back audio and text in real time.
+2. **Configure** — Send `session.update` **immediately** after connecting (before `session.created` arrives). Don't wait.
+3. **Stream audio** — Start streaming mic audio right away. The server warms up on this audio before the session is fully ready.
+4. **Clear on `session.created`** — When `session.created` arrives, send an `input_audio_buffer.clear` event (`connection.input_audio_buffer.clear()` in the OpenAI SDK) to discard the warmup audio. From this point on, fresh audio is processed with your config applied.
+5. **Receive responses** — The server transcribes user speech, generates a response, and streams audio and text in real time.
 
 The API is fully compatible with the OpenAI Realtime protocol, so the [OpenAI Python SDK](https://github.com/openai/openai-python), [LiveKit Agents](https://docs.livekit.io/agents/), and any OpenAI-compatible client work out of the box — just point them at `wss://speech-to-speech.assemblyai.com/v1`.
 
@@ -145,7 +189,7 @@ Describe your agent and we'll generate the complete code — system prompt, tool
 
 ## Configuration
 
-Configure your session by sending a `session.update` event after connecting. The API accepts two session formats depending on your integration approach.
+Configure your session by sending a `session.update` event immediately after connecting. The API accepts two session formats depending on your integration approach.
 
 ### Flat format (Raw WebSocket)
 
@@ -176,12 +220,13 @@ Configure your session by sending a `session.update` event after connecting. The
 
 ### Nested format (OpenAI SDK / LiveKit)
 
-The OpenAI GA SDK and LiveKit plugin use a nested session format.
+The OpenAI GA SDK and LiveKit plugin use a nested session format. Include `"type": "realtime"` in the session object.
 
 ```json
 {
   "type": "session.update",
   "session": {
+    "type": "realtime",
     "instructions": "You are a helpful voice assistant.",
     "output_modalities": ["audio", "text"],
     "audio": {
@@ -285,7 +330,7 @@ The server automatically detects when the user starts and stops speaking using v
 </ParamField>
 
 <ParamField path="threshold" type="float" default="0.5">
-  Speech detection sensitivity (0.0 to 1.0). Lower values detect quieter speech.
+  Speech detection sensitivity (0.0 to 1.0). Lower values detect quieter speech. If the agent is triggering on background noise, raise this to `0.6`–`0.7`.
 </ParamField>
 
 <ParamField path="prefix_padding_ms" type="integer" default="300">
@@ -293,7 +338,7 @@ The server automatically detects when the user starts and stops speaking using v
 </ParamField>
 
 <ParamField path="silence_duration_ms" type="integer" default="200">
-  How long the user must pause before the server considers them done speaking, in milliseconds.
+  How long the user must pause before the server considers them done speaking, in milliseconds. Raise to `400`–`500` if the agent starts responding before the user has finished speaking.
 </ParamField>
 
 <ParamField path="create_response" type="boolean" default="true">
@@ -326,39 +371,50 @@ Give your agent the ability to call functions in your application — look up da
 
 ### Handle tool calls
 
-When the agent decides to call a function, the server sends `response.function_call_arguments.done` while the response is still in progress. Start executing the function immediately — you don't need to wait. When `response.done` arrives, send the result back.
+When the agent decides to call a function, the server emits `response.output_item.done` with the complete function-call item (`name`, `arguments`, and `call_id`). Capture the call from that event, then execute it and send the result back once `response.done` arrives.
 
 ```python
-pending_tasks = {}
+import uuid
 
-async for raw in ws:
-    e = json.loads(raw)
-    et = e.get("type", "")
+pending_calls = []  # tool calls from the current response
 
-    if et == "response.function_call_arguments.done":
-        # Start executing immediately — don't wait for response.done
-        args = json.loads(e["arguments"])
-        pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+async for data in event_stream:
+    e = json.loads(data)
+    et = e.get("type", "")
 
-    elif et == "response.done" and pending_tasks:
-        # Response is complete — send back the results
-        for call_id, task in pending_tasks.items():
-            result = await task
-            await ws.send(json.dumps({
-                "type": "conversation.item.create",
-                "item": {
+    # Capture tool calls when the output item is fully done
+    if et == "response.output_item.done":
+        item = e.get("item", {})
+        if item.get("type") == "function_call":
+            pending_calls.append({
+                "name": item["name"],
+                "call_id": item["call_id"],
+                "arguments": json.loads(item.get("arguments", "{}")),
+            })
+
+    # Execute and return results when the response is complete
+    elif et == "response.done":
+        if pending_calls:
+            calls = pending_calls[:]
+            pending_calls.clear()
+            for call in calls:
+                result = run_tool(call["name"], call["arguments"])
+                await connection.conversation.item.create(item={
+                    "id": f"item_{uuid.uuid4().hex[:24]}",  # required
                     "type": "function_call_output",
-                    "call_id": call_id,
+                    "call_id": call["call_id"],
                     "output": json.dumps(result),
-                },
-            }))
-        pending_tasks.clear()
+                })
 
     elif et == "response.output_audio.delta":
         player.play(base64.b64decode(e["delta"]))
 ```
 
-The pattern is: **receive the call** → **start executing immediately** → **send the result when `response.done` arrives**. Your function runs concurrently while the response completes, so there's no wasted time.
+The pattern is: **capture the call** via `response.output_item.done` → **wait for `response.done`** → **send all results**. The server automatically generates the follow-up response once it receives the tool output, so you don't need to send `response.create`.
+
+<Tip>
+Include an `"id"` field in every `function_call_output` item. Without it, the server may not properly associate the result with the function call.
+</Tip>
 
 ---
 
@@ -381,17 +437,18 @@ The pattern is: **receive the call** → **start executing immediately** → **s
 | Event | Description | Key fields |
 |-------|-------------|------------|
 | `session.created` | Session initialized | `session.id` |
+| `session.updated` | Session config applied | `session` |
 | `input_audio_buffer.speech_started` | User started speaking | `audio_start_ms` |
 | `input_audio_buffer.speech_stopped` | User stopped speaking | `audio_end_ms` |
 | `input_audio_buffer.committed` | Audio committed as a turn | — |
 | `conversation.item.created` | New conversation item added | `item` |
 | `conversation.item.input_audio_transcription.completed` | User speech transcribed | `transcript` |
 | `response.created` | Agent started generating a response | — |
+| `response.output_item.done` | Output item complete (incl. function calls) | `item` |
 | `response.output_audio.delta` | Agent audio chunk | `delta`: base64 PCM16 |
 | `response.output_audio.done` | Agent audio complete | — |
 | `response.output_audio_transcript.delta` | Agent text (streaming) | `delta` |
 | `response.output_audio_transcript.done` | Agent text (final) | `transcript` |
-| `response.function_call_arguments.done` | Agent requesting a tool call | `call_id`, `name`, `arguments` |
 | `response.done` | Response complete | `response.status`: `completed` or `cancelled` |
 | `error` | Error occurred | `error.message` |
 
@@ -399,24 +456,28 @@ The pattern is: **receive the call** → **start executing immediately** → **s
 
 ## Complete examples
 
-Production-ready examples for three integration approaches. Each handles microphone input, speaker output, turn detection, transcription, and tool calling.
+Production-ready examples for three integration approaches. Each handles microphone input, speaker output, turn detection, transcription, interruptions, and tool calling.
 
-### Raw WebSocket
+### OpenAI Python SDK
 
-Direct WebSocket control using the `websockets` library.
+The recommended approach. Uses the OpenAI GA Realtime API with `client.realtime.connect()` and the nested session format.
 
 ```bash
-pip install websockets sounddevice
+pip install openai sounddevice
 ```
 
 ```python
-import asyncio, base64, json, threading, time
+import asyncio, base64, json, queue, struct, threading, time, uuid
 import sounddevice as sd
-import websockets
+from openai import AsyncOpenAI
 
 API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
-WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
-SAMPLE_RATE = 24000
+TARGET_RATE = 24000
+
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
 
 TOOLS = [{
     "type": "function",
@@ -430,142 +491,234 @@ TOOLS = [{
 }]
 
 
-async def run_tool(name, args):
-    """Replace with your own tool implementations."""
+def run_tool(name, args):
     if name == "get_weather":
         return {"temperature": 72, "condition": "sunny", "location": args["location"]}
     return {"error": f"Unknown tool: {name}"}
 
 
 class AudioPlayer:
+    """Callback-based player — the audio thread pulls from the buffer,
+    so play() returns instantly and never blocks the event loop."""
     def __init__(self):
         self._buf = bytearray()
         self._lock = threading.Lock()
         self._out = sd.RawOutputStream(
-            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+            samplerate=TARGET_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback,
         )
         self._out.start()
 
+    def _callback(self, outdata, frames, time_info, status):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n])
+            del self._buf[:n]
+        if len(chunk) < n:
+            chunk += b'\x00' * (n - len(chunk))
+        outdata[:] = chunk
+
     def play(self, pcm: bytes):
         with self._lock:
             self._buf.extend(pcm)
-            while len(self._buf) >= 960:
-                self._out.write(bytes(self._buf[:960]))
-                del self._buf[:960]
+
+    def clear(self):
+        """Stop agent audio immediately (call on user interruption)."""
+        with self._lock:
+            self._buf.clear()
 
     def close(self):
-        self._out.stop()
-        self._out.close()
+        self._out.stop(); self._out.close()
 
 
 async def main():
     player = AudioPlayer()
-    q = asyncio.Queue()
+    q = queue.Queue()
+
+    # Capture at the device's native sample rate to avoid hidden driver
+    # resampling buffers. We resample to 24kHz ourselves before sending.
+    device_info = sd.query_devices(kind="input")
+    native_rate = int(device_info["default_samplerate"])
 
     def mic_cb(data, frames, ti, status):
         q.put_nowait(bytes(data))
 
     mic = sd.RawInputStream(
-        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02),  # ~20ms chunks
         callback=mic_cb, latency="low",
     )
     mic.start()
 
-    try:
-        ws = await websockets.connect(WS_URL, extra_headers={"Authorization": f"Bearer {API_KEY}"})
-    except TypeError:
-        ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+    connection = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
 
-    await ws.send(json.dumps({"type": "session.update", "session": {
-        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
-        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
-        "input_audio_transcription": {"model": "universal-streaming"},
-        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
-        "output_modalities": ["audio", "text"],
+    # Send config IMMEDIATELY — don't wait for session.created
+    await connection.session.update(session={
+        "type": "realtime",
         "instructions": "You are a helpful voice assistant. Keep responses brief.",
-        "voice": "sage",
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {
+                "format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                "transcription": {"model": "universal-streaming"},
+                "turn_detection": {
+                    "type": "server_vad", "threshold": 0.5,
+                    "prefix_padding_ms": 300, "silence_duration_ms": 200,
+                },
+            },
+            "output": {
+                "format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                "voice": "sage",
+            },
+        },
         "tools": TOOLS,
         "tool_choice": "auto",
-    }}))
-
-    pending_tasks = {}
+    })
+
+    # Use the underlying websocket for audio sends — bypasses SDK
+    # serialization overhead on every chunk
+    raw_ws = connection._connection
+    loop = asyncio.get_running_loop()
+
+    # Drain mic audio that accumulated during the connection handshake
+    while not q.empty():
+        q.get_nowait()
+
+    def resample(pcm: bytes, src_rate: int) -> bytes:
+        """Linear interpolation resample to TARGET_RATE."""
+        if src_rate == TARGET_RATE:
+            return pcm
+        n = len(pcm) // 2
+        samples = struct.unpack(f"<{n}h", pcm)
+        ratio = src_rate / TARGET_RATE
+        out_len = int(n / ratio)
+        out = []
+        for i in range(out_len):
+            si = i * ratio
+            idx = int(si)
+            frac = si - idx
+            s1 = samples[min(idx, n - 1)]
+            s2 = samples[min(idx + 1, n - 1)]
+            out.append(int(s1 + frac * (s2 - s1)))
+        return struct.pack(f"<{out_len}h", *out)
 
     async def stream_mic():
         while True:
-            try:
-                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
-                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
-            except asyncio.TimeoutError:
-                pass
+            # run_in_executor waits on the thread-safe queue without
+            # blocking the event loop
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            while not q.empty():
+                try:
+                    buf.extend(q.get_nowait())
+                except Exception:
+                    break
+            await raw_ws.send(json.dumps({
+                "type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode(),
+            }))
 
     async def handle_events():
-        async for raw in ws:
-            e = json.loads(raw)
+        pending_calls = []
+        while True:
+            data = await connection.recv_bytes()
+            e = json.loads(data.decode("utf-8"))
             et = e.get("type", "")
+
+            # Fast path: audio deltas — decode and play with minimal overhead
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+                continue
+
             t = time.strftime("%H:%M:%S")
 
             if et == "session.created":
                 print(f"[{t}] Connected — session {e['session']['id']}")
+                # Clear warmup audio — fresh audio flows from here
+                await connection.input_audio_buffer.clear()
+
             elif et == "input_audio_buffer.speech_started":
                 print(f"[{t}] You started speaking")
+                player.clear()  # stop agent audio immediately on interruption
+
             elif et == "input_audio_buffer.speech_stopped":
                 print(f"[{t}] You stopped speaking")
+
             elif et == "conversation.item.input_audio_transcription.completed":
-                print(f"[{t}] You:   {e.get('transcript', '')}")
-            elif et == "response.output_audio.delta":
-                player.play(base64.b64decode(e["delta"]))
+                txt = e.get("transcript", "")
+                if txt:
+                    print(f"[{t}] You:   {txt}")
+
             elif et == "response.output_audio_transcript.done":
                 print(f"[{t}] Agent: {e.get('transcript', '')}")
-            elif et == "response.function_call_arguments.done":
-                args = json.loads(e["arguments"])
-                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
-                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+
+            # Capture tool calls when the output item is fully formed
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call":
+                    call_id = item.get("call_id", "")
+                    if not any(c["call_id"] == call_id for c in pending_calls):
+                        pending_calls.append({
+                            "name": item.get("name", ""),
+                            "call_id": call_id,
+                            "arguments": json.loads(item.get("arguments", "{}")),
+                        })
+                        print(f"[{t}] Tool:  {item['name']}({item.get('arguments', '{}')})")
+
             elif et == "response.done":
                 s = e.get("response", {}).get("status", "?")
                 print(f"[{t}] Done ({s})")
-                if pending_tasks and s == "completed":
-                    for cid, task in pending_tasks.items():
-                        result = await task
-                        await ws.send(json.dumps({
-                            "type": "conversation.item.create",
-                            "item": {"type": "function_call_output", "call_id": cid, "output": json.dumps(result)},
-                        }))
-                    pending_tasks.clear()
+                # Send all tool results — server auto-generates the follow-up response
+                if pending_calls:
+                    calls = pending_calls[:]
+                    pending_calls.clear()
+                    for call in calls:
+                        result = run_tool(call["name"], call["arguments"])
+                        print(f"[{t}]   → {call['name']}: {json.dumps(result)[:80]}")
+                        await connection.conversation.item.create(item={
+                            "id": f"item_{uuid.uuid4().hex[:24]}",
+                            "type": "function_call_output",
+                            "call_id": call["call_id"],
+                            "output": json.dumps(result),
+                        })
 
-    print("Listening — start talking.\n")
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
+
+    print(f"Listening — start talking.\n")
     try:
         await asyncio.gather(stream_mic(), handle_events())
     except KeyboardInterrupt:
         pass
     finally:
-        mic.stop(); mic.close(); player.close(); await ws.close()
+        mic.stop(); mic.close(); player.close()
+        await connection.close()
 
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
-### OpenAI Python SDK
+### Raw WebSocket
 
-Uses the OpenAI GA Realtime API. Note the differences from the beta API: `websocket_base_url` instead of `base_url`, `client.realtime.connect()` instead of `client.beta.realtime.connect()`, and the nested session format.
+Direct WebSocket control — useful if you can't use the OpenAI SDK or need the flat session format.
 
 ```bash
-pip install openai sounddevice
+pip install websockets sounddevice
 ```
 
 ```python
-import asyncio, base64, json, threading, time
+import asyncio, base64, json, queue, threading, time, uuid
 import sounddevice as sd
-from openai import AsyncOpenAI
+import websockets
 
 API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
 SAMPLE_RATE = 24000
 
-client = AsyncOpenAI(
-    api_key=API_KEY,
-    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
-)
-
 TOOLS = [{
     "type": "function",
     "name": "get_weather",
@@ -578,8 +731,7 @@ TOOLS = [{
 }]
 
 
-async def run_tool(name, args):
-    """Replace with your own tool implementations."""
+def run_tool(name, args):
     if name == "get_weather":
         return {"temperature": 72, "condition": "sunny", "location": args["location"]}
     return {"error": f"Unknown tool: {name}"}
@@ -590,115 +742,154 @@ class AudioPlayer:
         self._buf = bytearray()
         self._lock = threading.Lock()
         self._out = sd.RawOutputStream(
-            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback,
         )
         self._out.start()
 
+    def _callback(self, outdata, frames, time_info, status):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n])
+            del self._buf[:n]
+        if len(chunk) < n:
+            chunk += b'\x00' * (n - len(chunk))
+        outdata[:] = chunk
+
     def play(self, pcm: bytes):
         with self._lock:
             self._buf.extend(pcm)
-            while len(self._buf) >= 960:
-                self._out.write(bytes(self._buf[:960]))
-                del self._buf[:960]
+
+    def clear(self):
+        with self._lock:
+            self._buf.clear()
 
     def close(self):
-        self._out.stop()
-        self._out.close()
+        self._out.stop(); self._out.close()
 
 
 async def main():
     player = AudioPlayer()
-    q = asyncio.Queue()
+    q = queue.Queue()
 
     def mic_cb(data, frames, ti, status):
         q.put_nowait(bytes(data))
 
     mic = sd.RawInputStream(
-        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
-        callback=mic_cb, latency="low",
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+        blocksize=480, callback=mic_cb, latency="low",
     )
     mic.start()
 
-    connection = await client.realtime.connect(
-        model="universal-streaming",
-        websocket_connection_options={"compression": None},
-    ).enter()
+    # websockets 13.x uses extra_headers, 14.x+ uses additional_headers
+    try:
+        ws = await websockets.connect(WS_URL, extra_headers={"Authorization": f"Bearer {API_KEY}"})
+    except TypeError:
+        ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
 
-    print("Listening — start talking.\n")
+    # Send config immediately
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5,
+                           "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+        "tools": TOOLS,
+        "tool_choice": "auto",
+    }}))
 
-    async def send_config():
-        await connection.session.update(session={
-            "instructions": "You are a helpful voice assistant. Keep responses brief.",
-            "output_modalities": ["audio", "text"],
-            "audio": {
-                "input": {
-                    "format": {"type": "audio/pcm", "rate": 24000},
-                    "transcription": {"model": "universal-streaming"},
-                    "turn_detection": {
-                        "type": "server_vad", "threshold": 0.5,
-                        "prefix_padding_ms": 300, "silence_duration_ms": 200,
-                    },
-                },
-                "output": {
-                    "format": {"type": "audio/pcm", "rate": 24000},
-                    "voice": "sage",
-                },
-            },
-            "tools": TOOLS,
-            "tool_choice": "auto",
-        })
+    loop = asyncio.get_running_loop()
+
+    while not q.empty():
+        q.get_nowait()
 
     async def stream_mic():
         while True:
-            pcm = await q.get()
-            await connection.input_audio_buffer.append(audio=base64.b64encode(pcm).decode())
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            while not q.empty():
+                try:
+                    buf.extend(q.get_nowait())
+                except Exception:
+                    break
+            await ws.send(json.dumps({
+                "type": "input_audio_buffer.append",
+                "audio": base64.b64encode(bytes(buf)).decode(),
+            }))
 
     async def handle_events():
-        pending_tasks = {}
-        while True:
-            data = await connection.recv_bytes()
-            e = json.loads(data.decode("utf-8"))
+        pending_calls = []
+        async for raw in ws:
+            e = json.loads(raw)
             et = e.get("type", "")
             t = time.strftime("%H:%M:%S")
 
             if et == "session.created":
                 print(f"[{t}] Connected — session {e['session']['id']}")
+                await ws.send(json.dumps({"type": "input_audio_buffer.clear"}))
+
             elif et == "input_audio_buffer.speech_started":
                 print(f"[{t}] You started speaking")
+                player.clear()
+
             elif et == "input_audio_buffer.speech_stopped":
                 print(f"[{t}] You stopped speaking")
+
             elif et == "conversation.item.input_audio_transcription.completed":
                 txt = e.get("transcript", "")
                 if txt:
                     print(f"[{t}] You:   {txt}")
+
             elif et == "response.output_audio.delta":
                 player.play(base64.b64decode(e["delta"]))
+
             elif et == "response.output_audio_transcript.done":
                 print(f"[{t}] Agent: {e.get('transcript', '')}")
-            elif et == "response.function_call_arguments.done":
-                args = json.loads(e["arguments"])
-                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
-                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call":
+                    call_id = item.get("call_id", "")
+                    if not any(c["call_id"] == call_id for c in pending_calls):
+                        pending_calls.append({
+                            "name": item.get("name", ""),
+                            "call_id": call_id,
+                            "arguments": json.loads(item.get("arguments", "{}")),
+                        })
+                        print(f"[{t}] Tool:  {item['name']}({item.get('arguments', '{}')})")
+
             elif et == "response.done":
                 s = e.get("response", {}).get("status", "?")
                 print(f"[{t}] Done ({s})")
-                if pending_tasks and s == "completed":
-                    for cid, task in pending_tasks.items():
-                        result = await task
-                        await connection.conversation.item.create(item={
-                            "type": "function_call_output", "call_id": cid,
-                            "output": json.dumps(result)})
-                    pending_tasks.clear()
+                if pending_calls:
+                    calls = pending_calls[:]
+                    pending_calls.clear()
+                    for call in calls:
+                        result = run_tool(call["name"], call["arguments"])
+                        print(f"[{t}]   → {call['name']}: {json.dumps(result)[:80]}")
+                        await ws.send(json.dumps({
+                            "type": "conversation.item.create",
+                            "item": {
+                                "id": f"item_{uuid.uuid4().hex[:24]}",
+                                "type": "function_call_output",
+                                "call_id": call["call_id"],
+                                "output": json.dumps(result),
+                            },
+                        }))
+
             elif et == "error":
                 print(f"[{t}] Error: {e.get('error', {})}")
 
+    print("Listening — start talking.\n")
     try:
-        await asyncio.gather(send_config(), stream_mic(), handle_events())
+        await asyncio.gather(stream_mic(), handle_events())
     except KeyboardInterrupt:
         pass
     finally:
-        mic.stop(); mic.close(); player.close()
-        await connection.close()
+        mic.stop(); mic.close(); player.close(); await ws.close()
 
 
 if __name__ == "__main__":

From 1c0ba34f715d319c50d99a7cb93be1b1bdb41e1d Mon Sep 17 00:00:00 2001
From: dan-ince-aai <dince@assemblyai.com>
Date: Tue, 24 Feb 2026 13:29:01 +0000
Subject: [PATCH 5/7] update

---
 .../voice-agents/speechtospeech.mdx           | 2068 +++++++++++++----
 1 file changed, 1580 insertions(+), 488 deletions(-)

diff --git a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
index 630db1833..616771055 100644
--- a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
+++ b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -1,153 +1,1132 @@
 ---
 title: "Speech-to-Speech"
-description: "Build real-time voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back."
+description: "The fastest speech-to-speech API. Build production voice agents with a single WebSocket connection — built-in VAD, transcription, and function calling included."
 ---
 
-import { AgentGenerator } from "../../../../assets/components/AgentGenerator";
+Stream audio in, get intelligent spoken responses back — with built-in turn detection, real-time transcription, and function calling. OpenAI Realtime-compatible, so your existing clients work out of the box.
+
+<CardGroup cols={3}>
+  <Card title="300/300 knowledge grounding" icon="bullseye">
+    Perfect knowledge-grounding score (300/300) on the aiewf-eval speech-to-speech benchmark — every scored response was correctly grounded in the provided knowledge
+  </Card>
+  <Card title="Beats GPT Realtime on instruction following" icon="check">
+    270/300 vs 260/300 — follows complex, multi-step instructions more reliably
+  </Card>
+  <Card title="OpenAI-compatible" icon="plug">
+    Works with any OpenAI Realtime client — just swap the endpoint
+  </Card>
+</CardGroup>
+
+| Model | Pass Rate | Tool Use | Instruction Following | Knowledge Grounding |
+|-------|-----------|----------|-----------------------|---------------------|
+| **AssemblyAI** | **90.0%** | 270/300 | 270/300 | **300/300** |
+| GPT Realtime | 86.7% | **271/300** | 260/300 | **300/300** |
+| Gemini Live | 86.0% | 258/300 | 261/300 | 293/300 |
+| Grok Realtime | — | 267/300 | **275/300** | 295/300 |
+
+<Note>
+Benchmark data from [aiewf-eval](https://github.com/kwindla/aiewf-eval) — an open-source evaluation suite for speech-to-speech models covering tool use, instruction following, and knowledge grounding.
+</Note>
 
-Build voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back — with built-in transcription, turn detection, and function calling. The API is compatible with the [OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime), so you can use the OpenAI SDK or any OpenAI-compatible framework like LiveKit.
+---
 
-## Quickstart
+## Your agent in 30 seconds
 
-Install dependencies and talk to your agent in under a minute.
+<Steps>
 
+<Step title="Install">
 ```bash
 pip install openai sounddevice
 ```
+</Step>
 
-```python
+<Step title="Get your API key">
+Grab your key from the [AssemblyAI dashboard](https://www.assemblyai.com/dashboard/signup).
+</Step>
+
+<Step title="Run this">
+```python agent.py
 import asyncio, base64, json, queue, threading
 import sounddevice as sd
 from openai import AsyncOpenAI
 
+client = AsyncOpenAI(
+    api_key="YOUR_ASSEMBLYAI_API_KEY",
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=24000, channels=1,
+            dtype="int16", blocksize=480, latency="low", callback=self._cb)
+        self._out.start()
+    def _cb(self, out, frames, *_):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        out[:] = chunk + b'\x00' * (n - len(chunk))
+    def play(self, pcm):
+        with self._lock: self._buf.extend(pcm)
+    def clear(self):
+        with self._lock: self._buf.clear()
+    def close(self): self._out.stop(); self._out.close()
+
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+    sd.RawInputStream(samplerate=24000, channels=1, dtype="int16",
+        blocksize=480, latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d))).start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+
+    await conn.session.update(session={
+        "type": "realtime",
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": 24000},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.5,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 200}},
+            "output": {"format": {"type": "audio/pcm", "rate": 24000}, "voice": "sage"},
+        },
+    })
+
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()  # drain stale mic audio
+
+    async def stream_mic():
+        while True:
+            pcm = await loop.run_in_executor(None, q.get)
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(pcm).decode()}))
+
+    async def handle_events():
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+            if et == "session.created":
+                await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                player.clear()
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"Agent: {e.get('transcript', '')}")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"You:   {e.get('transcript', '')}")
+
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        player.close(); await conn.close()
+
+asyncio.run(main())
+```
+</Step>
+
+<Step title="Start talking">
+```bash
+python agent.py
+```
+You'll see transcripts in the terminal and hear the agent respond in real time.
+</Step>
+
+</Steps>
+
+<Tip>
+That's the entire stack — mic capture, VAD, transcription, LLM, and TTS — in one script. Keep reading to build something real.
+</Tip>
+
+---
+
+## What you can build
+
+These aren't toy demos — they're production-quality voice agents you can copy, run, and iterate on in minutes.
+
+<CardGroup cols={3}>
+  <Card title="🍕 Pizza order taker" href="#pizza-order-taker">
+    Takes a complete order through natural conversation. Handles toppings, sizes, delivery addresses, and specials. Calls a single tool at checkout — no mid-conversation tool spam. Stage-based prompting keeps the agent focused at each phase.
+  </Card>
+  <Card title="🔧 Auto service desk" href="#auto-service-desk">
+    Diagnoses car problems by asking smart follow-up questions ("does it happen all the time?", "any warning lights?"), suggests the right service, then books an appointment. The LLM's reasoning does the diagnostic work — no lookup tables needed.
+  </Card>
+  <Card title="🏥 ENT appointment setter" href="#ent-appointment-setter">
+    Routes callers to the right specialist based on their symptoms, checks doctor availability, and confirms all details before booking. Handles new vs. returning patients, allergy checks, and scheduling constraints.
+  </Card>
+</CardGroup>
+
+---
+
+## Agent examples
+
+<Tabs>
+
+<Tab title="🍕 Pizza order taker">
+
+A full pizza ordering experience — the agent answers the phone, takes the order through conversation, and calls a tool exactly once, after the customer confirms. No tool calls mid-order.
+
+**Try saying:** *"Hi, I'd like a large pepperoni with extra cheese for delivery to 42 Main Street."*
+
+**What makes it interesting:** The `place_order` tool is called exactly once at the end with the entire order — items, delivery method, and address all gathered from conversation context. Stage-based prompting shifts the agent's instructions as the call progresses.
+
+```python Key snippets
+STAGE_INSTRUCTIONS = {
+    "greeting": """You work the phones at Sal's Pizza.
+Your opener is "Sal's Pizza, pickup or delivery?" — nothing more.""",
+
+    "ordering": """Take items one by one. "Got it, what else?" after each.
+Remember everything from the conversation — no tools yet.
+When done: read back the order, ask "that sound right?"
+Only call place_order AFTER they confirm.""",
+
+    "placed": """Order is placed. Just say the total and ETA from the result.
+Do NOT repeat the items. Do NOT call more tools.""",
+}
+
+TOOLS = [{
+    "type": "function",
+    "name": "place_order",
+    "description": "Place the complete order. Call ONCE after the customer confirms.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "items": {
+                "type": "array",
+                "items": {"type": "object", "properties": {
+                    "type": {"type": "string", "description": "'pizza' or 'side' or 'drink'"},
+                    "size": {"type": "string"},
+                    "toppings": {"type": "array", "items": {"type": "string"}},
+                    "special": {"type": "string", "description": "Meat Lover's, Veggie Supreme, or BBQ Chicken"},
+                    "name": {"type": "string", "description": "For sides/drinks"},
+                    "quantity": {"type": "integer"},
+                }},
+            },
+            "method": {"type": "string", "description": "'delivery' or 'pickup'"},
+            "address": {"type": "string"},
+        },
+        "required": ["items", "method"],
+    },
+}]
+```
+
+<Accordion title="Full runnable script">
+
+```python pizza_agent.py
+import asyncio, base64, datetime, json, queue, struct, threading, time, uuid
+import sounddevice as sd
+from openai import AsyncOpenAI
+
 API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
-SAMPLE_RATE = 24000
+TARGET_RATE = 24000
 
 client = AsyncOpenAI(
     api_key=API_KEY,
     websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
 )
 
+MENU = {
+    "sizes": {"small": 8.99, "medium": 11.99, "large": 14.99, "extra-large": 17.99},
+    "toppings": {"pepperoni": 1.50, "sausage": 1.50, "bacon": 2.00, "ham": 1.50,
+                 "chicken": 2.00, "mushrooms": 1.00, "onions": 1.00, "peppers": 1.00,
+                 "olives": 1.00, "jalapeños": 1.00, "pineapple": 1.50, "extra cheese": 1.50},
+    "sides": {"garlic bread": 4.99, "wings (6pc)": 7.99, "wings (12pc)": 13.99, "caesar salad": 6.99},
+    "drinks": {"coke": 2.49, "diet coke": 2.49, "sprite": 2.49, "water": 1.99},
+    "specials": {
+        "Meat Lover's": {"toppings": ["pepperoni", "sausage", "bacon", "ham"], "discount": 2.00},
+        "Veggie Supreme": {"toppings": ["mushrooms", "onions", "peppers", "olives"], "discount": 2.50},
+        "BBQ Chicken": {"toppings": ["chicken", "onions", "bacon", "extra cheese"], "discount": 1.50},
+    },
+}
+
+orders = {}
+
+
+def calc_order(items):
+    line_items = []
+    for item in items:
+        if item.get("type") == "pizza":
+            size = item.get("size", "large")
+            base = MENU["sizes"].get(size, 14.99)
+            toppings = item.get("toppings", [])
+            special = item.get("special")
+            if special and special in MENU["specials"]:
+                toppings = MENU["specials"][special]["toppings"]
+                discount = MENU["specials"][special]["discount"]
+            else:
+                discount = 0
+            price = round(max(base + sum(MENU["toppings"].get(t, 1.00) for t in toppings) - discount, 0), 2)
+            line_items.append({"description": f"{size} {special or ', '.join(toppings) or 'cheese'} pizza", "price": price})
+        else:
+            name = item.get("name", "")
+            qty = item.get("quantity", 1)
+            unit = MENU["sides"].get(name) or MENU["drinks"].get(name)
+            if unit:
+                line_items.append({"description": f"{qty}x {name}", "price": round(unit * qty, 2)})
+    return line_items, sum(i["price"] for i in line_items)
+
+
+def run_tool(name, args):
+    if name != "place_order":
+        return {"error": f"Unknown tool: {name}"}
+    items = args.get("items", [])
+    method = args.get("method", "pickup")
+    address = args.get("address", "")
+    if not items:
+        return {"error": "No items in order"}
+    if method == "delivery" and not address:
+        return {"error": "Need address for delivery"}
+    line_items, subtotal = calc_order(items)
+    delivery_fee = 3.99 if method == "delivery" else 0
+    tax = round((subtotal + delivery_fee) * 0.08, 2)
+    return {
+        "confirmed": True,
+        "total": f"${round(subtotal + delivery_fee + tax, 2):.2f}",
+        "estimated_time": "30-45 min" if method == "delivery" else "15-20 min",
+    }
+
+
+TOOLS = [{
+    "type": "function",
+    "name": "place_order",
+    "description": "Place the complete order. Call ONCE after the customer confirms.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "items": {
+                "type": "array",
+                "items": {"type": "object", "properties": {
+                    "type": {"type": "string"},
+                    "size": {"type": "string"},
+                    "toppings": {"type": "array", "items": {"type": "string"}},
+                    "special": {"type": "string"},
+                    "name": {"type": "string"},
+                    "quantity": {"type": "integer"},
+                }},
+            },
+            "method": {"type": "string", "description": "'delivery' or 'pickup'"},
+            "address": {"type": "string"},
+        },
+        "required": ["items", "method"],
+    },
+}]
+
+VOICE = """You ARE a real person. Max 1-2 sentences. "Got it", "cool", "yep" — not "Absolutely!" or "I'd be happy to!"
+NEVER say: certainly, absolutely, I'd be happy to, great question, fantastic."""
+
+STAGE_INSTRUCTIONS = {
+    "greeting": VOICE + "\n\nYou work the phones at Sal's Pizza. Your opener: \"Sal's Pizza, pickup or delivery?\" Nothing more.",
+    "ordering": VOICE + f"""
+
+You're taking a pizza order at Sal's. Take items one by one. "Got it, what else?" after each.
+Remember everything from conversation — no tools yet. Default to regular crust.
+When done: "anything else?" once, then read back and ask "that sound right?"
+Only call place_order AFTER they confirm.
+
+Menu: small $8.99, medium $11.99, large $14.99, XL $17.99. Toppings $1-2 each.
+Specials: Meat Lover's ($2 off), Veggie Supreme ($2.50 off), BBQ Chicken ($1.50 off).
+Sides: garlic bread $4.99, wings 6pc $7.99/12pc $13.99. Drinks $2.49. Delivery: $3.99.""",
+    "placed": VOICE + "\n\nOrder placed. Just say total + ETA from the result. Do NOT repeat items. Do NOT call more tools.",
+}
+
 
 class AudioPlayer:
-    """Callback-based player — never blocks the event loop."""
     def __init__(self):
-        self._buf = bytearray()
-        self._lock = threading.Lock()
-        self._out = sd.RawOutputStream(
-            samplerate=SAMPLE_RATE, channels=1, dtype="int16",
-            blocksize=480, latency="low", callback=self._callback,
-        )
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=TARGET_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback)
         self._out.start()
 
-    def _callback(self, outdata, frames, time_info, status):
+    def _callback(self, outdata, frames, *_):
         n = frames * 2
         with self._lock:
-            chunk = bytes(self._buf[:n])
-            del self._buf[:n]
-        if len(chunk) < n:
-            chunk += b'\x00' * (n - len(chunk))
-        outdata[:] = chunk
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        outdata[:] = chunk + b'\x00' * (n - len(chunk))
 
-    def play(self, pcm: bytes):
-        with self._lock:
-            self._buf.extend(pcm)
+    def play(self, pcm):
+        with self._lock: self._buf.extend(pcm)
 
     def clear(self):
-        with self._lock:
-            self._buf.clear()
+        with self._lock: self._buf.clear()
 
-    def close(self):
-        self._out.stop(); self._out.close()
+    def close(self): self._out.stop(); self._out.close()
 
 
-async def main():
-    player = AudioPlayer()
-    q = queue.Queue()
+def resample(pcm: bytes, src: int) -> bytes:
+    if src == TARGET_RATE: return pcm
+    n = len(pcm) // 2
+    samps = struct.unpack(f"<{n}h", pcm)
+    r = src / TARGET_RATE
+    out = [int(samps[min(int(i*r), n-1)] + (samps[min(int(i*r)+1, n-1)] - samps[min(int(i*r), n-1)]) * (i*r - int(i*r)))
+           for i in range(int(n / r))]
+    return struct.pack(f"<{len(out)}h", *out)
 
-    def mic_cb(data, frames, ti, status):
-        q.put_nowait(bytes(data))
 
-    mic = sd.RawInputStream(
-        samplerate=SAMPLE_RATE, channels=1, dtype="int16",
-        blocksize=480, callback=mic_cb, latency="low",
-    )
-    mic.start()
-
-    connection = await client.realtime.connect(
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
+    sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d))).start()
+
+    conn = await client.realtime.connect(
         model="universal-streaming",
         websocket_connection_options={"compression": None},
     ).enter()
 
-    # Send config immediately after connecting
-    await connection.session.update(session={
+    await conn.session.update(session={
         "type": "realtime",
-        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "instructions": STAGE_INSTRUCTIONS["greeting"],
         "output_modalities": ["audio", "text"],
         "audio": {
-            "input": {
-                "format": {"type": "audio/pcm", "rate": SAMPLE_RATE},
-                "transcription": {"model": "universal-streaming"},
-                "turn_detection": {
-                    "type": "server_vad", "threshold": 0.5,
-                    "prefix_padding_ms": 300, "silence_duration_ms": 200,
-                },
-            },
-            "output": {
-                "format": {"type": "audio/pcm", "rate": SAMPLE_RATE},
-                "voice": "sage",
-            },
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.6,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 500}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
         },
+        "tools": TOOLS, "tool_choice": "auto",
     })
 
-    raw_ws = connection._connection
+    raw_ws = conn._connection
     loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()
+
+    async def set_stage(name):
+        await conn.session.update(session={"type": "realtime", "instructions": STAGE_INSTRUCTIONS[name]})
 
-    # Drain any mic audio that accumulated during connection setup
-    while not q.empty():
-        q.get_nowait()
+    stage = "greeting"
+    user_turns = 0
 
     async def stream_mic():
         while True:
             pcm = await loop.run_in_executor(None, q.get)
-            await raw_ws.send(json.dumps({
-                "type": "input_audio_buffer.append",
-                "audio": base64.b64encode(pcm).decode(),
-            }))
+            buf = bytearray(pcm)
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
 
     async def handle_events():
+        nonlocal stage, user_turns
+        pending_call = None
         while True:
-            data = await connection.recv_bytes()
-            e = json.loads(data.decode("utf-8"))
+            e = json.loads((await conn.recv_bytes()).decode())
             et = e.get("type", "")
-
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"])); continue
+            t = time.strftime("%H:%M:%S")
             if et == "session.created":
-                print(f"Connected — session {e['session']['id']}")
-                await connection.input_audio_buffer.clear()
+                await conn.input_audio_buffer.clear()
             elif et == "input_audio_buffer.speech_started":
-                player.clear()  # stop agent audio when user interrupts
-            elif et == "response.output_audio.delta":
-                player.play(base64.b64decode(e["delta"]))
+                player.clear()
+                user_turns += 1
+                if stage == "greeting" and user_turns >= 2:
+                    stage = "ordering"; await set_stage("ordering")
             elif et == "response.output_audio_transcript.done":
-                print(f"Agent: {e.get('transcript', '')}")
-            elif et == "conversation.item.input_audio_transcription.completed":
-                print(f"You:   {e.get('transcript', '')}")
+                print(f"[{t}] Sal's: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call" and not pending_call:
+                    pending_call = {"call_id": item["call_id"], "arguments": json.loads(item.get("arguments", "{}"))}
+            elif et == "response.done":
+                if pending_call:
+                    stage = "placed"; await set_stage("placed")
+                    result = run_tool("place_order", pending_call["arguments"])
+                    print(f"[{t}] 🧾 {result.get('total', '?')} — {result.get('estimated_time', '?')}")
+                    await conn.conversation.item.create(item={
+                        "id": f"item_{uuid.uuid4().hex[:24]}", "type": "function_call_output",
+                        "call_id": pending_call["call_id"], "output": json.dumps(result),
+                    })
+                    pending_call = None
+                elif stage == "greeting" and e.get("response", {}).get("status") == "completed" and user_turns >= 1:
+                    stage = "ordering"; await set_stage("ordering")
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
 
-    print("Listening — start talking.\n")
+    print("\n🍕 Sal's Pizza — Order Line\n")
     try:
         await asyncio.gather(stream_mic(), handle_events())
     except KeyboardInterrupt:
         pass
     finally:
-        mic.stop(); mic.close(); player.close()
-        await connection.close()
+        player.close(); await conn.close()
 
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
-Replace `YOUR_ASSEMBLYAI_API_KEY` with your key from the [AssemblyAI dashboard](https://www.assemblyai.com/dashboard/signup), run the script, and start talking.
+</Accordion>
+
+</Tab>
+
+<Tab title="🔧 Auto service desk">
+
+A mechanic's service advisor that diagnoses car problems through conversation before booking. No lookup tables — the LLM does the reasoning.
+
+**Try saying:** *"My brakes are making a grinding noise when I slow down."*
+
+**What makes it interesting:** The agent asks smart diagnostic follow-ups ("does it happen all the time?", "any warning lights?", "what's the mileage?") and maps symptoms to the right service. One tool call at the end with everything.
+
+```python Key snippets
+STAGES = {
+    "greeting": """Answer: "Mike's Auto, this is the service desk, how can I help?"
+Find out what's going on. Don't use tools.""",
+
+    "diagnosing": """Figure out the problem. Ask one question at a time:
+- "When did it start?"
+- "Does it happen all the time or sometimes?"
+- "Any warning lights?"
+- "Roughly how many miles?"
+You know cars — grinding brakes = worn pads. Check engine light = diagnostic.
+Once you have a clear picture, confirm details and only then call book_service.""",
+
+    "booked": """Appointment booked. Just give them the date and time.
+Mention the lockbox if they're dropping off early. Say bye. No more tools.""",
+}
+
+# Symptom → service mappings the agent knows:
+# Grinding/squealing when braking → brake inspection/replacement
+# Check engine light → engine diagnostic ($95-125)
+# Won't start / slow crank → battery test
+# AC not cold → AC service ($125-250)
+# Pulling to one side → alignment check
+# Vibration at speed → tire balance
+```
+
+<Accordion title="Full runnable script">
+
+```python mechanic_agent.py
+import asyncio, base64, datetime, json, queue, struct, threading, time, uuid
+import sounddevice as sd
+from openai import AsyncOpenAI
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+TARGET_RATE = 24000
+
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+TODAY = datetime.date.today()
+bookings = {}
+
+
+def get_available_slots(date_str):
+    booked = {b["time"] for b in bookings.values() if b["date"] == date_str}
+    return [f"{h:02d}:{m:02d}" for h in range(8, 17) for m in [0, 30] if f"{h:02d}:{m:02d}" not in booked]
+
+
+def run_tool(name, args):
+    if name != "book_service": return {"error": f"Unknown: {name}"}
+    customer = args.get("customer_name", "")
+    vehicle = args.get("vehicle", "")
+    issue = args.get("issue_description", "")
+    date = args.get("date", "")
+    time_slot = args.get("time", "")
+    if not all([customer, vehicle, issue, date, time_slot]):
+        return {"error": "Missing: customer_name, vehicle, issue_description, date, time"}
+    try: d = datetime.date.fromisoformat(date)
+    except ValueError: return {"error": f"Bad date: {date}"}
+    if d < TODAY: return {"error": "Can't book in the past"}
+    if d.weekday() >= 6: return {"error": "Closed Sundays"}
+    available = get_available_slots(date)
+    if time_slot not in available:
+        return {"error": f"{time_slot} taken. Open: {', '.join(available[:4])}"}
+    booking_id = f"MA-{uuid.uuid4().hex[:6].upper()}"
+    bookings[booking_id] = {"id": booking_id, "customer": customer, "vehicle": vehicle,
+                             "issue": issue, "date": date, "time": time_slot,
+                             "drop_off": args.get("drop_off", False)}
+    return {"confirmed": True, "booking_id": booking_id, "date": d.strftime("%A %B %d"),
+            "time": time_slot, "drop_off_note": "Lockbox by the front door for early drop-off." if args.get("drop_off") else None}
+
+
+TOOLS = [{
+    "type": "function",
+    "name": "book_service",
+    "description": "Book a service appointment. Call ONCE after confirming all details.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "customer_name": {"type": "string"},
+            "phone": {"type": "string"},
+            "vehicle": {"type": "string", "description": "Year make model (e.g. '2019 Honda Civic')"},
+            "mileage": {"type": "string"},
+            "issue_description": {"type": "string"},
+            "suggested_service": {"type": "string"},
+            "date": {"type": "string", "description": "YYYY-MM-DD"},
+            "time": {"type": "string", "description": "HH:MM (24h)"},
+            "drop_off": {"type": "boolean"},
+        },
+        "required": ["customer_name", "vehicle", "issue_description", "date", "time"],
+    },
+}]
+
+VOICE = """You ARE a real person. You're the service advisor at Mike's Auto. Friendly, knows cars. Max 1-2 sentences.
+"Yeah", "got it", "mm-hmm" — not "Absolutely!" or "I'd be happy to!"
+NEVER say: certainly, absolutely, I'd be happy to, great question, fantastic."""
+
+STAGES = {
+    "greeting": VOICE + f"\n\nAnswer: \"Mike's Auto, service desk, how can I help?\"\nFind out what's wrong. Don't use tools. Today: {TODAY.strftime('%A, %B %d, %Y')}. Open Mon-Sat 8-5.",
+    "diagnosing": VOICE + f"""
+
+Figure out the problem. Ask one follow-up at a time — react to answers naturally.
+Ask: when it started, if it's constant or intermittent, warning lights, mileage, what kind of car.
+Based on symptoms, suggest what it might be and what service is needed.
+You know cars: grinding brakes = worn pads; check engine = diagnostic; won't start = battery; AC not cold = recharge.
+Once you have everything, confirm: "[name], [vehicle], [issue], [date] at [time]. That right?"
+Only call book_service AFTER they confirm. Today: {TODAY.strftime('%A, %B %d, %Y')}.""",
+    "booked": VOICE + "\n\nAppointment booked. Give date + time. Mention lockbox for early drop-off. Ask if anything else. Say bye. No more tools.",
+}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=TARGET_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._cb)
+        self._out.start()
+
+    def _cb(self, out, frames, *_):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        out[:] = chunk + b'\x00' * (n - len(chunk))
+
+    def play(self, pcm):
+        with self._lock: self._buf.extend(pcm)
+
+    def clear(self):
+        with self._lock: self._buf.clear()
+
+    def close(self): self._out.stop(); self._out.close()
+
+
+def resample(pcm, src):
+    if src == TARGET_RATE: return pcm
+    n = len(pcm) // 2; samps = struct.unpack(f"<{n}h", pcm); r = src / TARGET_RATE
+    out = [int(samps[min(int(i*r), n-1)] + (samps[min(int(i*r)+1, n-1)] - samps[min(int(i*r), n-1)]) * (i*r - int(i*r)))
+           for i in range(int(n / r))]
+    return struct.pack(f"<{len(out)}h", *out)
+
+
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
+    sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d))).start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming", websocket_connection_options={"compression": None},
+    ).enter()
+    await conn.session.update(session={
+        "type": "realtime", "instructions": STAGES["greeting"],
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.6,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 500}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
+        },
+        "tools": TOOLS, "tool_choice": "auto",
+    })
+
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()
+
+    async def set_stage(name):
+        await conn.session.update(session={"type": "realtime", "instructions": STAGES[name]})
+
+    stage = "greeting"
+    user_turns = 0
+
+    async def stream_mic():
+        while True:
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
+
+    async def handle_events():
+        nonlocal stage, user_turns
+        pending = None
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"])); continue
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created": await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                player.clear(); user_turns += 1
+                if stage == "greeting" and user_turns >= 2:
+                    stage = "diagnosing"; await set_stage("diagnosing")
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Mike's: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call" and not pending:
+                    pending = {"call_id": item["call_id"], "arguments": json.loads(item.get("arguments", "{}"))}
+                    print(f"[{t}] 🔧 booking...")
+            elif et == "response.done":
+                if pending:
+                    stage = "booked"; await set_stage("booked")
+                    result = run_tool("book_service", pending["arguments"])
+                    if result.get("confirmed"): print(f"[{t}] ✅ {result['date']} at {result['time']}")
+                    else:
+                        print(f"[{t}] ❌ {result.get('error')}"); stage = "diagnosing"; await set_stage("diagnosing")
+                    await conn.conversation.item.create(item={
+                        "id": f"item_{uuid.uuid4().hex[:24]}", "type": "function_call_output",
+                        "call_id": pending["call_id"], "output": json.dumps(result),
+                    })
+                    pending = None
+                elif stage == "greeting" and e.get("response", {}).get("status") == "completed" and user_turns >= 1:
+                    stage = "diagnosing"; await set_stage("diagnosing")
+            elif et == "error": print(f"[{t}] Error: {e.get('error', {})}")
+
+    print(f"\n🔧 Mike's Auto — Service Desk\nToday: {TODAY.strftime('%A, %B %d, %Y')}\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        player.close(); await conn.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+</Accordion>
+
+</Tab>
+
+<Tab title="🏥 ENT appointment setter">
+
+Routes callers to the right specialist based on their symptoms, finds an available slot, and books the appointment. Handles new vs. returning patients and scheduling constraints.
+
+**Try saying:** *"Hi, I've been having really bad sinus pressure for a few weeks."*
+
+**What makes it interesting:** The agent recommends the right doctor based on described symptoms (sinus → Dr. Okafor, child → Dr. Sharma, lump → Dr. Liu), gathers patient info conversationally, and only books after confirmation. Falls back gracefully on scheduling errors.
+
+```python Key snippets
+PROVIDERS = {
+    "Dr. Sarah Chen":    {"specialty": "General ENT",         "days": ["Monday","Tuesday","Wednesday","Thursday","Friday"]},
+    "Dr. Michael Okafor":{"specialty": "Sinus & Allergy",     "days": ["Monday","Wednesday","Friday"]},
+    "Dr. Priya Sharma":  {"specialty": "Pediatric ENT",       "days": ["Tuesday","Thursday"]},
+    "Dr. James Liu":     {"specialty": "Head & Neck Surgery", "days": ["Monday","Tuesday","Thursday"]},
+}
+
+# Routing logic lives in the prompt — no hard-coded rules:
+# "Sinus issues, allergies, congestion → Dr. Okafor"
+# "Kids → Dr. Sharma"
+# "Lumps, growths, post-surgical → Dr. Liu"
+# "Everything else → Dr. Chen (Mon-Fri)"
+```
+
+<Accordion title="Full runnable script">
+
+```python ent_agent.py
+import asyncio, base64, datetime, json, queue, struct, threading, time, uuid
+import sounddevice as sd
+from openai import AsyncOpenAI
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+TARGET_RATE = 24000
+
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+TODAY = datetime.date.today()
+PROVIDERS = {
+    "Dr. Sarah Chen":    {"specialty": "General ENT",         "days": ["Monday","Tuesday","Wednesday","Thursday","Friday"]},
+    "Dr. Michael Okafor":{"specialty": "Sinus & Allergy",     "days": ["Monday","Wednesday","Friday"]},
+    "Dr. Priya Sharma":  {"specialty": "Pediatric ENT",       "days": ["Tuesday","Thursday"]},
+    "Dr. James Liu":     {"specialty": "Head & Neck Surgery", "days": ["Monday","Tuesday","Thursday"]},
+}
+appointments = {}
+
+
+def get_slots(provider, date):
+    booked = {a["time"] for a in appointments.values() if a["provider"] == provider and a["date"] == date}
+    return [f"{h:02d}:{m:02d}" for h in range(8, 17) for m in [0, 30] if f"{h:02d}:{m:02d}" not in booked]
+
+
+def run_tool(name, args):
+    if name != "book_appointment": return {"error": f"Unknown: {name}"}
+    patient = args.get("patient_name", "")
+    provider = args.get("provider", "")
+    date = args.get("date", "")
+    slot = args.get("time", "")
+    reason = args.get("reason", "")
+    if not all([patient, provider, date, slot, reason]):
+        return {"error": "Missing: patient_name, provider, date, time, reason"}
+    if provider not in PROVIDERS: return {"error": f"Unknown provider. Use: {list(PROVIDERS.keys())}"}
+    try: d = datetime.date.fromisoformat(date)
+    except: return {"error": f"Bad date: {date}"}
+    day = d.strftime("%A")
+    if day not in PROVIDERS[provider]["days"]:
+        return {"error": f"{provider} doesn't work {day}s. Available: {', '.join(PROVIDERS[provider]['days'])}"}
+    if d < TODAY: return {"error": "Can't book in the past"}
+    available = get_slots(provider, date)
+    if slot not in available: return {"error": f"{slot} not available. Open: {', '.join(available[:5])}"}
+    apt_id = f"ENT-{uuid.uuid4().hex[:6].upper()}"
+    appointments[apt_id] = {"id": apt_id, "patient": patient, "provider": provider,
+                             "date": date, "time": slot, "reason": reason,
+                             "new_patient": args.get("new_patient", False)}
+    return {"confirmed": True, "appointment_id": apt_id,
+            "summary": f"{patient} with {provider} on {d.strftime('%A %B %d')} at {slot}",
+            "arrive_early": args.get("new_patient", False)}
+
+
+TOOLS = [{
+    "type": "function",
+    "name": "book_appointment",
+    "description": "Book an ENT appointment. Call ONCE after confirming all details.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "patient_name": {"type": "string"},
+            "provider": {"type": "string", "description": "Doctor's full name"},
+            "date": {"type": "string", "description": "YYYY-MM-DD"},
+            "time": {"type": "string", "description": "HH:MM (24h)"},
+            "reason": {"type": "string"},
+            "phone": {"type": "string"},
+            "new_patient": {"type": "boolean"},
+        },
+        "required": ["patient_name", "provider", "date", "time", "reason"],
+    },
+}]
+
+VOICE = """You ARE a real person. Warm but efficient — you're busy. Max 1-2 sentences.
+"Sure thing", "got it", "mm-hmm" — not "Absolutely!" or "I'd be happy to!"
+NEVER say: certainly, absolutely, I'd be happy to, great question, fantastic."""
+
+STAGES = {
+    "greeting": VOICE + f"\n\nYou're the receptionist at Riverside ENT Associates. Answer: \"Riverside ENT, how can I help you?\"\nFind out what they need. Ask if new or existing. Don't use tools. Today: {TODAY.strftime('%A, %B %d, %Y')}.",
+    "scheduling": VOICE + f"""
+
+You're scheduling at Riverside ENT. Gather through conversation: patient name, what they need seen for, preferred date/time, callback number.
+Suggest the right doctor based on their symptoms. Don't use tools yet.
+
+Today: {TODAY.strftime('%A, %B %d, %Y')}.
+Doctors:
+- Dr. Sarah Chen — General ENT (Mon-Fri). Good default.
+- Dr. Michael Okafor — Sinus & Allergy (Mon/Wed/Fri). Sinus, allergies, congestion.
+- Dr. Priya Sharma — Pediatric ENT (Tue/Thu). Kids.
+- Dr. James Liu — Head & Neck Surgery (Mon/Tue/Thu). Lumps, growths, post-surgical.
+
+Once you have everything: "[name] with [doctor] on [date] at [time] for [reason]. Sound right?"
+Only call book_appointment AFTER they confirm.""",
+    "booked": VOICE + "\n\nAppointment booked. Tell them the confirmation (date, time, doctor).\nIf new patient, mention arriving 15 min early for paperwork. Ask if anything else. Say bye. No more tools.",
+}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=TARGET_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._cb)
+        self._out.start()
+
+    def _cb(self, out, frames, *_):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        out[:] = chunk + b'\x00' * (n - len(chunk))
+
+    def play(self, pcm):
+        with self._lock: self._buf.extend(pcm)
+
+    def clear(self):
+        with self._lock: self._buf.clear()
+
+    def close(self): self._out.stop(); self._out.close()
+
+
+def resample(pcm, src):
+    if src == TARGET_RATE: return pcm
+    n = len(pcm) // 2; samps = struct.unpack(f"<{n}h", pcm); r = src / TARGET_RATE
+    out = [int(samps[min(int(i*r), n-1)] + (samps[min(int(i*r)+1, n-1)] - samps[min(int(i*r), n-1)]) * (i*r - int(i*r)))
+           for i in range(int(n / r))]
+    return struct.pack(f"<{len(out)}h", *out)
+
+
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
+    sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d))).start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming", websocket_connection_options={"compression": None},
+    ).enter()
+    await conn.session.update(session={
+        "type": "realtime", "instructions": STAGES["greeting"],
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.6,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 500}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
+        },
+        "tools": TOOLS, "tool_choice": "auto",
+    })
+
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()
+
+    async def set_stage(name):
+        await conn.session.update(session={"type": "realtime", "instructions": STAGES[name]})
+
+    stage = "greeting"
+    user_turns = 0
+
+    async def stream_mic():
+        while True:
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
+
+    async def handle_events():
+        nonlocal stage, user_turns
+        pending = None
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"])); continue
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created": await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                player.clear(); user_turns += 1
+                if stage == "greeting" and user_turns >= 2:
+                    stage = "scheduling"; await set_stage("scheduling")
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Riverside: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call" and not pending:
+                    pending = {"call_id": item["call_id"], "arguments": json.loads(item.get("arguments", "{}"))}
+                    print(f"[{t}] 🔧 booking...")
+            elif et == "response.done":
+                if pending:
+                    stage = "booked"; await set_stage("booked")
+                    result = run_tool("book_appointment", pending["arguments"])
+                    if result.get("confirmed"): print(f"[{t}] ✅ {result['summary']}")
+                    else:
+                        print(f"[{t}] ❌ {result.get('error')}"); stage = "scheduling"; await set_stage("scheduling")
+                    await conn.conversation.item.create(item={
+                        "id": f"item_{uuid.uuid4().hex[:24]}", "type": "function_call_output",
+                        "call_id": pending["call_id"], "output": json.dumps(result),
+                    })
+                    pending = None
+                elif stage == "greeting" and e.get("response", {}).get("status") == "completed" and user_turns >= 1:
+                    stage = "scheduling"; await set_stage("scheduling")
+            elif et == "error": print(f"[{t}] Error: {e.get('error', {})}")
+
+    print(f"\n🏥 Riverside ENT Associates\nToday: {TODAY.strftime('%A, %B %d, %Y')}\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        player.close(); await conn.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+</Accordion>
+
+</Tab>
+
+</Tabs>
+
+---
+
+## Iterate fast with Claude Code
+
+The best way to build and tune voice agents is with an AI coding assistant that already understands the API. Drop a `CLAUDE.md` into your project and [Claude Code](https://claude.ai/code) will handle the boilerplate so you can focus on the agent behavior.
+
+<Steps>
+
+<Step title="Install Claude Code">
+```bash
+npm install -g @anthropic-ai/claude-code
+```
+</Step>
+
+<Step title="Start from an example above">
+Copy one of the agent scripts above into your project. Running it first means Claude Code has a concrete starting point to build on.
+</Step>
+
+<Step title="Drop in a CLAUDE.md">
+Create `CLAUDE.md` at the root of your project with the context below. Claude Code reads this automatically and will know exactly how the API works — no copy-pasting docs into every prompt.
+
+````markdown CLAUDE.md
+# AssemblyAI Speech-to-Speech API
+
+## Connection sequence (critical — order matters)
+1. Connect WebSocket to `wss://speech-to-speech.assemblyai.com/v1/realtime` (OpenAI SDK: set `websocket_base_url` to `wss://speech-to-speech.assemblyai.com/v1`)
+2. Send `session.update` IMMEDIATELY — do NOT wait for `session.created`
+3. Start streaming mic audio right away (server needs audio to warm up)
+4. When `session.created` arrives: call `input_audio_buffer.clear()`, drain mic queue
+5. From here, fresh audio flows with config applied
+
+## OpenAI SDK (Python) session config
+```python
+await connection.session.update(session={
+    "type": "realtime",            # required for nested format
+    "instructions": "...",
+    "output_modalities": ["audio", "text"],
+    "audio": {
+        "input": {
+            "format": {"type": "audio/pcm", "rate": 24000},
+            "transcription": {"model": "universal-streaming"},
+            "turn_detection": {
+                "type": "server_vad",
+                "threshold": 0.5,      # raise to 0.6-0.7 if triggering on noise
+                "prefix_padding_ms": 300,
+                "silence_duration_ms": 200,  # raise to 400-500 if interrupting too early
+            },
+        },
+        "output": {"format": {"type": "audio/pcm", "rate": 24000}, "voice": "sage"},
+    },
+    "tools": [...],
+    "tool_choice": "auto",
+})
+```
+
+## Tool calling (capture → wait → inject)
+```python
+pending_calls = []
+
+# Capture on response.output_item.done (not response.function_call_arguments.done)
+if et == "response.output_item.done":
+    item = e.get("item", {})
+    if item.get("type") == "function_call":
+        pending_calls.append({"name": item["name"], "call_id": item["call_id"],
+                               "arguments": json.loads(item.get("arguments", "{}"))})
+
+# Inject on response.done — server auto-generates follow-up, no response.create needed
+elif et == "response.done":
+    for call in pending_calls:
+        result = run_tool(call["name"], call["arguments"])
+        await connection.conversation.item.create(item={
+            "id": f"item_{uuid.uuid4().hex[:24]}",  # id field is REQUIRED
+            "type": "function_call_output",
+            "call_id": call["call_id"],
+            "output": json.dumps(result),
+        })
+    pending_calls.clear()
+```
+
+## Stage-based prompting
+Update instructions mid-session to keep the agent focused at each phase:
+```python
+await connection.session.update(session={
+    "type": "realtime",
+    "instructions": new_stage_instructions,
+})
+```
+Do this:
+- After the opening exchange (greeting → main task)
+- Before injecting a tool result (so post-tool speech uses focused instructions)
+- When recovering from an error (reset back to earlier stage)
+
+## AudioPlayer — must be callback-based
+Never use blocking `write()` on the event loop — it starves mic sending and delays everything:
+```python
+class AudioPlayer:
+    def _callback(self, outdata, frames, *_):  # runs in audio thread — non-blocking
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        outdata[:] = chunk + b'\x00' * (n - len(chunk))
+
+    def play(self, pcm: bytes):  # returns instantly — just appends
+        with self._lock: self._buf.extend(pcm)
+
+    def clear(self):  # call on speech_started to stop agent audio immediately
+        with self._lock: self._buf.clear()
+```
+
+## Mic streaming
+Use `queue.Queue` (thread-safe) + `run_in_executor` to bridge mic to async:
+```python
+q = queue.Queue()
+# mic callback: q.put_nowait(bytes(data))
+
+async def stream_mic():
+    while True:
+        pcm = await loop.run_in_executor(None, q.get)  # non-blocking wait
+        await raw_ws.send(json.dumps({...}))
+```
+
+Use `raw_ws = connection._connection` and send via `raw_ws.send()` directly for audio
+— bypasses SDK Pydantic overhead on every 20ms chunk.
+
+## Common pitfalls
+- ❌ Sending config on `session.created` — send it BEFORE, immediately after connect
+- ❌ Using `asyncio.Queue` in threads — it's not thread-safe, use `queue.Queue`
+- ❌ Blocking `write()` for audio playback — starves the event loop
+- ❌ Chaining tool calls across multiple responses — process all at once in `response.done`
+- ❌ Missing `id` field in `function_call_output` — breaks tool result association
+- ❌ Large chunks (100ms+) — adds VAD latency. Keep to ~20ms (480 samples at 24kHz)
+````
+</Step>
+
+<Step title="Ask Claude to build or modify your agent">
+```
+> Change the pizza agent persona to a fast food drive-through, add a combo meals tool,
+  and make it push upsells for drinks
+```
+```
+> Add a check_calendar tool to the ENT agent that returns real availability
+  from a Google Calendar API call
+```
+```
+> Tune the VAD settings — the agent keeps interrupting me mid-sentence
+```
+
+Claude Code reads your CLAUDE.md, understands the session init sequence and tool calling pattern, and makes changes that actually work.
+</Step>
+
+</Steps>
+
+<Tip>
+Start with one of the example agents above, not a blank file. Claude Code iterates much faster when it has a running baseline to modify rather than generating from scratch.
+</Tip>
 
 ---
 
@@ -157,10 +1136,10 @@ Replace `YOUR_ASSEMBLYAI_API_KEY` with your key from the [AssemblyAI dashboard](
 Client                                     Server
   |                                           |
   |--- WebSocket connect -------------------->|
-  |--- session.update (config) -------------->|  send immediately after connect
-  |--- input_audio_buffer.append ------------>|  stream mic audio (start right away)
+  |--- session.update (config) -------------->|  ← send immediately, don't wait
+  |--- input_audio_buffer.append ------------>|  ← start streaming right away
   |                                           |
-  |<------------ session.created -------------|  clear buffer here
+  |<------------ session.created -------------|  ← clear buffer here
   |<------------ speech_started --------------|  user is talking
   |<------------ speech_stopped --------------|  user finished
   |<------------ transcription.completed -----|  what the user said
@@ -169,58 +1148,115 @@ Client                                     Server
   |                                           |
 ```
 
-1. **Connect** — Open a WebSocket to `wss://speech-to-speech.assemblyai.com/v1/realtime` with your API key in the `Authorization: Bearer` header.
-2. **Configure** — Send `session.update` **immediately** after connecting (before `session.created` arrives). Don't wait.
-3. **Stream audio** — Start streaming mic audio right away. The server warms up on this audio before the session is fully ready.
-4. **Clear on `session.created`** — When `session.created` arrives, call `input_audio_buffer.clear()` to discard the warmup audio. From this point, fresh audio flows with your config applied.
-5. **Receive responses** — The server transcribes user speech, generates a response, and streams audio and text in real time.
+<Warning>
+**Send config immediately — don't wait for `session.created`.** The server needs audio before it sends `session.created`, so send `session.update` and start streaming microphone audio as soon as the socket opens. That way your instructions and turn detection settings are already in place the moment the session is live. When `session.created` arrives, call `input_audio_buffer.clear()` to discard the warmup audio and start fresh.
+</Warning>
+
+The API is fully compatible with the OpenAI Realtime protocol, so the [OpenAI Python SDK](https://github.com/openai/openai-python), [OpenAI JS SDK](https://github.com/openai/openai-node), [LiveKit Agents](https://docs.livekit.io/agents/), and any OpenAI-compatible client work out of the box — just point them at `wss://speech-to-speech.assemblyai.com/v1`.
 
-The API is fully compatible with the OpenAI Realtime protocol, so the [OpenAI Python SDK](https://github.com/openai/openai-python), [LiveKit Agents](https://docs.livekit.io/agents/), and any OpenAI-compatible client work out of the box — just point them at `wss://speech-to-speech.assemblyai.com/v1`.
+---
+
+## Tool calling
+
+Give your agent the ability to call functions — look up data, take actions, call external APIs — then continue the conversation with the result.
+
+### Define tools
+
+```json
+"tools": [{
+  "type": "function",
+  "name": "check_availability",
+  "description": "Check available appointment slots for a given date",
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "date": {"type": "string", "description": "Date in YYYY-MM-DD format"}
+    },
+    "required": ["date"]
+  }
+}],
+"tool_choice": "auto"
+```
+
+### Handle tool calls
+
+Capture calls via `response.output_item.done`, then inject results on `response.done`. The server auto-generates the follow-up response — no need to call `response.create`.
+
+```python
+import uuid
+
+pending_calls = []
+
+# Step 1: capture when the output item is fully formed
+if et == "response.output_item.done":
+    item = e.get("item", {})
+    if item.get("type") == "function_call":
+        pending_calls.append({
+            "name": item["name"],
+            "call_id": item["call_id"],
+            "arguments": json.loads(item.get("arguments", "{}")),
+        })
+
+# Step 2: execute and inject on response.done
+elif et == "response.done":
+    if pending_calls:
+        calls = pending_calls[:]
+        pending_calls.clear()
+        for call in calls:
+            result = run_tool(call["name"], call["arguments"])
+            await connection.conversation.item.create(item={
+                "id": f"item_{uuid.uuid4().hex[:24]}",  # required
+                "type": "function_call_output",
+                "call_id": call["call_id"],
+                "output": json.dumps(result),
+            })
+```
+
+<Tip>
+Always include the `"id"` field in `function_call_output`. Without it, the server may not properly associate the result with the function call.
+</Tip>
+
+### Stage-based prompting
+
+For multi-step agents, update the instructions at each phase of the conversation. This keeps the model focused and prevents it from repeating information it has already given.
+
+<Accordion title="How stage-based prompting works">
+
+Instead of one long system prompt that covers every possible phase, split your agent into stages and update the instructions as the conversation progresses:
+
+```python
+STAGES = {
+    "greeting": "Answer the phone. Find out what the caller needs. No tools yet.",
+    "main_task": "You have all the context now. Take the action and gather any remaining details.",
+    "wrap_up": "The task is done. Confirm the result briefly and say bye. No more tools.",
+}
 
----
+async def set_stage(name: str):
+    await connection.session.update(session={
+        "type": "realtime",
+        "instructions": STAGES[name],
+    })
 
-## Agent generator
+# In your event handler:
+# After user speaks a couple times → set_stage("main_task")
+# Before injecting tool result → set_stage("wrap_up")   ← critical timing
+```
 
-Describe your agent and we'll generate the complete code — system prompt, tool definitions, and a runnable script.
+**The critical timing:** switch to the wrap-up stage *before* injecting the tool result, not after. The agent responds to the tool result using whichever instructions are active at that moment. If you have already switched to the `wrap_up` instructions ("just give them the confirmation, no repeating") before it sees the result, the follow-up response will be tight and focused.
 
-<AgentGenerator />
+</Accordion>
 
 ---
 
 ## Configuration
 
-Configure your session by sending a `session.update` event immediately after connecting. The API accepts two session formats depending on your integration approach.
+### Session format
 
-### Flat format (Raw WebSocket)
-
-```json
-{
-  "type": "session.update",
-  "session": {
-    "instructions": "You are a helpful voice assistant.",
-    "voice": "sage",
-    "input_audio_format": "pcm16",
-    "input_audio_sample_rate": 24000,
-    "output_audio_format": "pcm16",
-    "output_audio_sample_rate": 24000,
-    "input_audio_transcription": {"model": "universal-streaming"},
-    "output_modalities": ["audio", "text"],
-    "turn_detection": {
-      "type": "server_vad",
-      "threshold": 0.5,
-      "prefix_padding_ms": 300,
-      "silence_duration_ms": 200,
-      "create_response": true
-    },
-    "tools": [],
-    "tool_choice": "auto"
-  }
-}
-```
+<Tabs>
 
-### Nested format (OpenAI SDK / LiveKit)
+<Tab title="OpenAI SDK (nested)">
 
-The OpenAI GA SDK and LiveKit plugin use a nested session format. Include `"type": "realtime"` in the session object.
+Used by the OpenAI Python and JS SDKs and by LiveKit. Include `"type": "realtime"` in the `session` object.
 
 ```json
 {
@@ -237,8 +1273,7 @@ The OpenAI GA SDK and LiveKit plugin use a nested session format. Include `"type
           "type": "server_vad",
           "threshold": 0.5,
           "prefix_padding_ms": 300,
-          "silence_duration_ms": 200,
-          "create_response": true
+          "silence_duration_ms": 200
         }
       },
       "output": {
@@ -252,34 +1287,53 @@ The OpenAI GA SDK and LiveKit plugin use a nested session format. Include `"type
 }
 ```
 
-### Session parameters
+</Tab>
 
-<ParamField path="instructions" type="string">
-  System prompt for the AI agent. Defines personality, behavior, and constraints.
-</ParamField>
+<Tab title="Raw WebSocket (flat)">
 
-<ParamField path="voice" type="string" default="sage">
-  Voice for agent audio responses. One of: `sage`, `ember`, `breeze`, `cascade`.
-</ParamField>
+Use this format when connecting over a raw WebSocket.
 
-<ParamField path="input_audio_format" type="string" default="pcm16">
-  Input audio encoding. Use `pcm16` (signed 16-bit little-endian).
-</ParamField>
+```json
+{
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "voice": "sage",
+    "input_audio_format": "pcm16",
+    "input_audio_sample_rate": 24000,
+    "output_audio_format": "pcm16",
+    "output_audio_sample_rate": 24000,
+    "input_audio_transcription": {"model": "universal-streaming"},
+    "output_modalities": ["audio", "text"],
+    "turn_detection": {
+      "type": "server_vad",
+      "threshold": 0.5,
+      "prefix_padding_ms": 300,
+      "silence_duration_ms": 200,
+      "create_response": true
+    },
+    "tools": [],
+    "tool_choice": "auto"
+  }
+}
+```
 
-<ParamField path="input_audio_sample_rate" type="integer" default="24000">
-  Input audio sample rate in Hz.
-</ParamField>
+</Tab>
+
+</Tabs>
+
+### Session parameters
 
-<ParamField path="output_audio_format" type="string" default="pcm16">
-  Output audio encoding. Use `pcm16` (signed 16-bit little-endian).
+<ParamField path="instructions" type="string">
+  System prompt. Defines the agent's personality, role, and behavior. Can be updated mid-session with `session.update` to implement stage-based prompting.
 </ParamField>
 
-<ParamField path="output_audio_sample_rate" type="integer" default="24000">
-  Output audio sample rate in Hz.
+<ParamField path="voice" type="string" default="sage">
+  Voice for agent responses. Options: `sage`, `ember`, `breeze`, `cascade`.
 </ParamField>
 
 <ParamField path="output_modalities" type="array">
-  What the agent returns. Include `"audio"` for spoken responses and `"text"` for transcripts.
+  What the agent returns. `["audio", "text"]` gives you both spoken responses and transcripts.
 </ParamField>
 
 <ParamField path="input_audio_transcription" type="object">
@@ -287,33 +1341,14 @@ The OpenAI GA SDK and LiveKit plugin use a nested session format. Include `"type
 </ParamField>
 
 <ParamField path="turn_detection" type="object">
-  Server-side voice activity detection. See [Turn detection](#turn-detection).
+  Server-side voice activity detection settings. See **Turn detection tuning** below.
 </ParamField>
 
 <ParamField path="tools" type="array" default="[]">
-  Functions the agent can call. See [Tool calling](#tool-calling).
-</ParamField>
-
-<ParamField path="tool_choice" type="string" default="auto">
-  When to use tools. `"auto"` lets the model decide.
+  Function definitions the agent can call. See [Tool calling](#tool-calling).
 </ParamField>
 
-### Audio format
-
-All audio is **PCM16** (signed 16-bit integer, little-endian), **mono**, **24,000 Hz**. Audio is base64-encoded inside JSON messages. Each chunk should be approximately 20 ms (480 samples, 960 bytes).
-
-### Voices
-
-| Voice | ID |
-|-------|----|
-| Sage | `sage` |
-| Ember | `ember` |
-| Breeze | `breeze` |
-| Cascade | `cascade` |
-
-### Turn detection
-
-The server automatically detects when the user starts and stops speaking using voice activity detection (VAD). When the user finishes a turn, the agent responds automatically.
+### Turn detection tuning
 
 ```json
 "turn_detection": {
@@ -325,148 +1360,52 @@ The server automatically detects when the user starts and stops speaking using v
 }
 ```
 
-<ParamField path="type" type="string" required>
-  Set to `"server_vad"` for server-side voice activity detection.
-</ParamField>
-
 <ParamField path="threshold" type="float" default="0.5">
-  Speech detection sensitivity (0.0 to 1.0). Lower values detect quieter speech. If the agent is triggering on background noise, raise this to `0.6`–`0.7`.
+  Speech detection sensitivity (0.0–1.0). Lower values detect quieter speech. **Raise to 0.6–0.7** if the agent is triggering on background noise or its own audio.
 </ParamField>
 
 <ParamField path="prefix_padding_ms" type="integer" default="300">
-  Audio to preserve before speech onset, in milliseconds. Prevents clipping the start of a sentence.
+  Audio preserved before speech onset, in milliseconds. Prevents clipping the start of sentences.
 </ParamField>
 
 <ParamField path="silence_duration_ms" type="integer" default="200">
-  How long the user must pause before the server considers them done speaking, in milliseconds. Raise to `400`–`500` if the agent interrupts too eagerly.
+  How long the user must pause, in milliseconds, before the turn is considered complete. **Raise to 400–500** if the agent interrupts before the user has finished speaking.
 </ParamField>
 
 <ParamField path="create_response" type="boolean" default="true">
-  Automatically generate an agent response when the user finishes speaking.
+  Auto-generate a response when the user finishes their turn.
 </ParamField>
 
----
-
-## Tool calling
-
-Give your agent the ability to call functions in your application — look up data, take actions, or call external APIs — then continue the conversation with the result.
-
-### Define tools in your session config
-
-```json
-"tools": [{
-  "type": "function",
-  "name": "get_weather",
-  "description": "Get the current weather for a location",
-  "parameters": {
-    "type": "object",
-    "properties": {
-      "location": {"type": "string", "description": "City name"}
-    },
-    "required": ["location"]
-  }
-}],
-"tool_choice": "auto"
-```
-
-### Handle tool calls
-
-When the agent decides to call a function, it emits `response.output_item.done` with the complete call (name, arguments, call ID). Capture it there, then execute and return the result when `response.done` arrives.
-
-```python
-import uuid
-
-pending_calls = []  # tool calls from the current response
-
-async for data in event_stream:
-    e = json.loads(data)
-    et = e.get("type", "")
-
-    # Capture tool calls when the output item is fully done
-    if et == "response.output_item.done":
-        item = e.get("item", {})
-        if item.get("type") == "function_call":
-            pending_calls.append({
-                "name": item["name"],
-                "call_id": item["call_id"],
-                "arguments": json.loads(item.get("arguments", "{}")),
-            })
-
-    # Execute and return results when the response is complete
-    elif et == "response.done":
-        if pending_calls:
-            calls = pending_calls[:]
-            pending_calls.clear()
-            for call in calls:
-                result = run_tool(call["name"], call["arguments"])
-                await connection.conversation.item.create(item={
-                    "id": f"item_{uuid.uuid4().hex[:24]}",  # required
-                    "type": "function_call_output",
-                    "call_id": call["call_id"],
-                    "output": json.dumps(result),
-                })
-
-    elif et == "response.output_audio.delta":
-        player.play(base64.b64decode(e["delta"]))
-```
-
-The pattern is: **capture the call** via `response.output_item.done` → **wait for `response.done`** → **send all results**. The server auto-generates the follow-up response after receiving the tool output — no need to call `response.create`.
-
-<Tip>
-Include an `"id"` field in every `function_call_output` item. Without it, the server may not properly associate the result with the function call.
-</Tip>
-
----
-
-## Events reference
-
-### Client → Server
+### Voices
 
-| Event | Description | Key fields |
-|-------|-------------|------------|
-| `session.update` | Configure the session | `session`: configuration object |
-| `input_audio_buffer.append` | Stream an audio chunk | `audio`: base64-encoded PCM16 |
-| `input_audio_buffer.commit` | Commit buffered audio as a user turn | — |
-| `input_audio_buffer.clear` | Discard buffered audio | — |
-| `conversation.item.create` | Add a message or tool result | `item`: conversation item |
-| `conversation.item.delete` | Remove a conversation item | `item_id`: ID to remove |
-| `response.create` | Trigger the agent to respond | — |
+| Voice | ID | Character |
+|-------|----|-----------|
+| Sage | `sage` | Warm, measured, professional |
+| Ember | `ember` | Expressive, energetic |
+| Breeze | `breeze` | Conversational, approachable |
+| Cascade | `cascade` | Clear, authoritative |
 
-### Server → Client
+### Audio format
 
-| Event | Description | Key fields |
-|-------|-------------|------------|
-| `session.created` | Session initialized | `session.id` |
-| `session.updated` | Session config applied | `session` |
-| `input_audio_buffer.speech_started` | User started speaking | `audio_start_ms` |
-| `input_audio_buffer.speech_stopped` | User stopped speaking | `audio_end_ms` |
-| `input_audio_buffer.committed` | Audio committed as a turn | — |
-| `conversation.item.created` | New conversation item added | `item` |
-| `conversation.item.input_audio_transcription.completed` | User speech transcribed | `transcript` |
-| `response.created` | Agent started generating a response | — |
-| `response.output_item.done` | Output item complete (incl. function calls) | `item` |
-| `response.output_audio.delta` | Agent audio chunk | `delta`: base64 PCM16 |
-| `response.output_audio.done` | Agent audio complete | — |
-| `response.output_audio_transcript.delta` | Agent text (streaming) | `delta` |
-| `response.output_audio_transcript.done` | Agent text (final) | `transcript` |
-| `response.done` | Response complete | `response.status`: `completed` or `cancelled` |
-| `error` | Error occurred | `error.message` |
+All audio is **PCM16** (signed 16-bit little-endian), **mono**, **24,000 Hz**, base64-encoded in JSON. Send chunks of approximately 20 ms — 480 samples, which is 960 bytes raw or 1,280 bytes once base64-encoded.
 
 ---
 
-## Complete examples
-
-Production-ready examples for three integration approaches. Each handles microphone input, speaker output, turn detection, transcription, interruptions, and tool calling.
+## SDK and framework examples
 
-### OpenAI Python SDK
+<Tabs>
 
-The recommended approach. Uses the OpenAI GA Realtime API with `client.realtime.connect()` and the nested session format.
+<Tab title="Python (OpenAI SDK)" default>
 
 ```bash
 pip install openai sounddevice
 ```
 
-```python
+The recommended approach. Full production example with native-rate mic capture, callback-based audio player, interruption handling, and correct tool calling.
+
+<Accordion title="Full example">
+
+```python agent.py
 import asyncio, base64, json, queue, struct, threading, time, uuid
 import sounddevice as sd
 from openai import AsyncOpenAI
@@ -498,219 +1437,355 @@ def run_tool(name, args):
 
 
 class AudioPlayer:
-    """Callback-based player — the audio thread pulls from the buffer,
-    so play() returns instantly and never blocks the event loop."""
+    """Callback-based — play() returns instantly, audio thread does the work."""
     def __init__(self):
-        self._buf = bytearray()
-        self._lock = threading.Lock()
-        self._out = sd.RawOutputStream(
-            samplerate=TARGET_RATE, channels=1, dtype="int16",
-            blocksize=480, latency="low", callback=self._callback,
-        )
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=TARGET_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback)
         self._out.start()
 
-    def _callback(self, outdata, frames, time_info, status):
+    def _callback(self, outdata, frames, *_):
         n = frames * 2
         with self._lock:
-            chunk = bytes(self._buf[:n])
-            del self._buf[:n]
-        if len(chunk) < n:
-            chunk += b'\x00' * (n - len(chunk))
-        outdata[:] = chunk
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        outdata[:] = chunk + b'\x00' * (n - len(chunk))
 
     def play(self, pcm: bytes):
-        with self._lock:
-            self._buf.extend(pcm)
+        with self._lock: self._buf.extend(pcm)
 
-    def clear(self):
-        """Stop agent audio immediately (call on user interruption)."""
-        with self._lock:
-            self._buf.clear()
+    def clear(self):  # call on speech_started to stop agent audio immediately
+        with self._lock: self._buf.clear()
 
-    def close(self):
-        self._out.stop(); self._out.close()
+    def close(self): self._out.stop(); self._out.close()
 
 
 async def main():
-    player = AudioPlayer()
-    q = queue.Queue()
-
-    # Capture at the device's native sample rate to avoid hidden driver
-    # resampling buffers. We resample to 24kHz ourselves before sending.
-    device_info = sd.query_devices(kind="input")
-    native_rate = int(device_info["default_samplerate"])
+    player, q = AudioPlayer(), queue.Queue()
 
-    def mic_cb(data, frames, ti, status):
-        q.put_nowait(bytes(data))
+    # Capture at native rate — avoids hidden driver resampling buffers
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
 
-    mic = sd.RawInputStream(
-        samplerate=native_rate, channels=1, dtype="int16",
-        blocksize=int(native_rate * 0.02),  # ~20ms chunks
-        callback=mic_cb, latency="low",
-    )
-    mic.start()
+    def mic_cb(data, frames, ti, status): q.put_nowait(bytes(data))
+    sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), callback=mic_cb, latency="low").start()
 
-    connection = await client.realtime.connect(
+    conn = await client.realtime.connect(
         model="universal-streaming",
         websocket_connection_options={"compression": None},
     ).enter()
 
     # Send config IMMEDIATELY — don't wait for session.created
-    await connection.session.update(session={
+    await conn.session.update(session={
         "type": "realtime",
         "instructions": "You are a helpful voice assistant. Keep responses brief.",
         "output_modalities": ["audio", "text"],
         "audio": {
-            "input": {
-                "format": {"type": "audio/pcm", "rate": TARGET_RATE},
-                "transcription": {"model": "universal-streaming"},
-                "turn_detection": {
-                    "type": "server_vad", "threshold": 0.5,
-                    "prefix_padding_ms": 300, "silence_duration_ms": 200,
-                },
-            },
-            "output": {
-                "format": {"type": "audio/pcm", "rate": TARGET_RATE},
-                "voice": "sage",
-            },
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.5,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 200}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
         },
-        "tools": TOOLS,
-        "tool_choice": "auto",
+        "tools": TOOLS, "tool_choice": "auto",
     })
 
-    # Use the underlying websocket for audio sends — bypasses SDK
-    # serialization overhead on every chunk
-    raw_ws = connection._connection
+    raw_ws = conn._connection
     loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()  # drain stale mic audio
 
-    # Drain mic audio that accumulated during the connection handshake
-    while not q.empty():
-        q.get_nowait()
-
-    def resample(pcm: bytes, src_rate: int) -> bytes:
-        """Linear interpolation resample to TARGET_RATE."""
-        if src_rate == TARGET_RATE:
-            return pcm
-        n = len(pcm) // 2
-        samples = struct.unpack(f"<{n}h", pcm)
-        ratio = src_rate / TARGET_RATE
-        out_len = int(n / ratio)
-        out = []
-        for i in range(out_len):
-            si = i * ratio
-            idx = int(si)
-            frac = si - idx
-            s1 = samples[min(idx, n - 1)]
-            s2 = samples[min(idx + 1, n - 1)]
-            out.append(int(s1 + frac * (s2 - s1)))
-        return struct.pack(f"<{out_len}h", *out)
+    def resample(pcm: bytes, src: int) -> bytes:
+        if src == TARGET_RATE: return pcm
+        n = len(pcm) // 2; samps = struct.unpack(f"<{n}h", pcm); r = src / TARGET_RATE
+        out = [int(samps[min(int(i*r), n-1)] + (samps[min(int(i*r)+1, n-1)] -
+               samps[min(int(i*r), n-1)]) * (i*r - int(i*r))) for i in range(int(n / r))]
+        return struct.pack(f"<{len(out)}h", *out)
 
     async def stream_mic():
         while True:
-            # run_in_executor waits on the thread-safe queue without
-            # blocking the event loop
             pcm = await loop.run_in_executor(None, q.get)
             buf = bytearray(pcm)
             while not q.empty():
-                try:
-                    buf.extend(q.get_nowait())
-                except Exception:
-                    break
-            await raw_ws.send(json.dumps({
-                "type": "input_audio_buffer.append",
-                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode(),
-            }))
+                try: buf.extend(q.get_nowait())
+                except: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
 
     async def handle_events():
         pending_calls = []
         while True:
-            data = await connection.recv_bytes()
-            e = json.loads(data.decode("utf-8"))
+            e = json.loads((await conn.recv_bytes()).decode())
             et = e.get("type", "")
 
-            # Fast path: audio deltas — decode and play with minimal overhead
             if et == "response.output_audio.delta":
-                player.play(base64.b64decode(e["delta"]))
-                continue
+                player.play(base64.b64decode(e["delta"])); continue
 
             t = time.strftime("%H:%M:%S")
-
             if et == "session.created":
-                print(f"[{t}] Connected — session {e['session']['id']}")
-                # Clear warmup audio — fresh audio flows from here
-                await connection.input_audio_buffer.clear()
-
+                print(f"[{t}] Connected — {e['session']['id']}")
+                await conn.input_audio_buffer.clear()
             elif et == "input_audio_buffer.speech_started":
-                print(f"[{t}] You started speaking")
-                player.clear()  # stop agent audio immediately on interruption
-
-            elif et == "input_audio_buffer.speech_stopped":
-                print(f"[{t}] You stopped speaking")
-
+                print(f"[{t}] You speaking"); player.clear()
             elif et == "conversation.item.input_audio_transcription.completed":
-                txt = e.get("transcript", "")
-                if txt:
-                    print(f"[{t}] You:   {txt}")
-
+                if txt := e.get("transcript", ""): print(f"[{t}] You:   {txt}")
             elif et == "response.output_audio_transcript.done":
                 print(f"[{t}] Agent: {e.get('transcript', '')}")
-
-            # Capture tool calls when the output item is fully formed
             elif et == "response.output_item.done":
                 item = e.get("item", {})
                 if item.get("type") == "function_call":
-                    call_id = item.get("call_id", "")
+                    call_id = item["call_id"]
                     if not any(c["call_id"] == call_id for c in pending_calls):
-                        pending_calls.append({
-                            "name": item.get("name", ""),
-                            "call_id": call_id,
-                            "arguments": json.loads(item.get("arguments", "{}")),
-                        })
-                        print(f"[{t}] Tool:  {item['name']}({item.get('arguments', '{}')})")
-
+                        pending_calls.append({"name": item["name"], "call_id": call_id,
+                                              "arguments": json.loads(item.get("arguments", "{}"))})
+                        print(f"[{t}] Tool: {item['name']}({item.get('arguments', '{}')})")
             elif et == "response.done":
-                s = e.get("response", {}).get("status", "?")
-                print(f"[{t}] Done ({s})")
-                # Send all tool results — server auto-generates the follow-up response
                 if pending_calls:
-                    calls = pending_calls[:]
-                    pending_calls.clear()
+                    calls = pending_calls[:]; pending_calls.clear()
                     for call in calls:
                         result = run_tool(call["name"], call["arguments"])
-                        print(f"[{t}]   → {call['name']}: {json.dumps(result)[:80]}")
-                        await connection.conversation.item.create(item={
+                        print(f"[{t}]   → {json.dumps(result)[:80]}")
+                        await conn.conversation.item.create(item={
                             "id": f"item_{uuid.uuid4().hex[:24]}",
                             "type": "function_call_output",
                             "call_id": call["call_id"],
                             "output": json.dumps(result),
                         })
-
             elif et == "error":
                 print(f"[{t}] Error: {e.get('error', {})}")
 
-    print(f"Listening — start talking.\n")
+    print("Listening — start talking.\n")
     try:
         await asyncio.gather(stream_mic(), handle_events())
     except KeyboardInterrupt:
         pass
     finally:
-        mic.stop(); mic.close(); player.close()
-        await connection.close()
+        player.close(); await conn.close()
 
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
-### Raw WebSocket
+</Accordion>
+
+</Tab>
+
+<Tab title="JavaScript (Browser)">
+
+```bash
+npm install openai
+```
+
+Browser-based voice agent using the Web Audio API for low-latency, gapless playback. Connect through your own proxy server: browsers can't set an `Authorization` header on a WebSocket, so the proxy adds your API key (see the proxy pattern below).
+
+<Accordion title="Full browser example">
+
+```javascript agent.js
+// Browser voice agent — connect to a proxy that adds your AssemblyAI API key
+// Proxy endpoint: wss://your-server.com/api/s2s
+
+const WS_PROXY_URL = 'wss://your-server.com/api/s2s';
+
+// ── Audio playback ──────────────────────────────────────────────────────────
+// Web Audio API with precise scheduling: buffers play back-to-back with no gaps
+const audioCtx = new AudioContext({ sampleRate: 24000 });
+const gainNode = audioCtx.createGain();
+gainNode.connect(audioCtx.destination);
+let nextPlayTime = 0;
+
+function decodeAndPlay(base64Delta) {
+  const binary = atob(base64Delta);
+  const bytes = new Uint8Array(binary.length);
+  for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
+
+  const int16 = new Int16Array(bytes.buffer);
+  const float32 = new Float32Array(int16.length);
+  for (let i = 0; i < int16.length; i++) float32[i] = int16[i] / 32768;
+
+  const buffer = audioCtx.createBuffer(1, float32.length, 24000);
+  buffer.getChannelData(0).set(float32);
+
+  const source = audioCtx.createBufferSource();
+  source.buffer = buffer;
+  source.connect(gainNode);
+
+  const now = audioCtx.currentTime;
+  if (nextPlayTime < now) nextPlayTime = now; // reset if fallen behind
+  source.start(nextPlayTime);
+  nextPlayTime += buffer.duration;
+}
+
+function stopPlayback() {
+  // Recreate AudioContext to immediately stop all scheduled audio (interruption)
+  gainNode.disconnect();
+  const newCtx = new AudioContext({ sampleRate: 24000 });
+  const newGain = newCtx.createGain();
+  newGain.connect(newCtx.destination);
+  Object.assign(audioCtx, newCtx);
+  Object.assign(gainNode, newGain);
+  nextPlayTime = 0;
+}
+
+// ── Mic capture ─────────────────────────────────────────────────────────────
+async function startMic(onChunk) {
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+  const ctx = new AudioContext(); // capture at native rate
+  const source = ctx.createMediaStreamSource(stream);
+  const processor = ctx.createScriptProcessor(2048, 1, 1);
+
+  processor.onaudioprocess = (e) => {
+    const inputData = e.inputBuffer.getChannelData(0);
+
+    // Resample from native rate to 24kHz
+    const ratio = ctx.sampleRate / 24000;
+    const outLength = Math.floor(inputData.length / ratio);
+    const pcm16 = new Int16Array(outLength);
+    for (let i = 0; i < outLength; i++) {
+      const srcIdx = i * ratio;
+      const idx = Math.floor(srcIdx);
+      const frac = srcIdx - idx;
+      const s1 = inputData[idx] || 0;
+      const s2 = inputData[Math.min(idx + 1, inputData.length - 1)] || s1;
+      const sample = Math.max(-1, Math.min(1, s1 + frac * (s2 - s1)));
+      pcm16[i] = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
+    }
+
+    // Convert to base64
+    const bytes = new Uint8Array(pcm16.buffer);
+    let binary = '';
+    for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i]);
+    onChunk(btoa(binary));
+  };
+
+  source.connect(processor);
+  processor.connect(ctx.destination);
+  return { stop: () => { processor.disconnect(); stream.getTracks().forEach(t => t.stop()); } };
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+async function startAgent() { // Wires mic -> proxy WebSocket -> speaker; resolves to { stop } once the mic is capturing
+  // Resume AudioContext on user gesture (browser autoplay policy)
+  await audioCtx.resume();
+
+  const ws = new WebSocket(WS_PROXY_URL);
+  let sessionReady = false; // set in onopen after session.update is sent; gates mic forwarding below
+
+  ws.onopen = () => {
+    // Send config immediately on open (this handler does not wait for session.created)
+    ws.send(JSON.stringify({
+      type: 'session.update',
+      session: {
+        type: 'realtime',
+        instructions: 'You are a helpful voice assistant. Keep responses brief.',
+        output_modalities: ['audio', 'text'],
+        audio: {
+          input: {
+            format: { type: 'audio/pcm', rate: 24000 },
+            transcription: { model: 'universal-streaming' },
+            turn_detection: { type: 'server_vad', threshold: 0.5,
+                               prefix_padding_ms: 300, silence_duration_ms: 200 },
+          },
+          output: { format: { type: 'audio/pcm', rate: 24000 }, voice: 'sage' },
+        },
+      },
+    }));
+    sessionReady = true;
+  };
+
+  const pendingCalls = []; // function calls captured during a response, executed when that response finishes
+
+  ws.onmessage = (event) => {
+    const msg = JSON.parse(event.data);
+
+    // Fast path: audio deltas — decode and schedule immediately
+    if (msg.type === 'response.output_audio.delta') {
+      decodeAndPlay(msg.delta);
+      return;
+    }
+
+    if (msg.type === 'session.created') {
+      console.log('Connected — session', msg.session.id);
+      ws.send(JSON.stringify({ type: 'input_audio_buffer.clear' })); // discard audio buffered so far
+    }
+    else if (msg.type === 'input_audio_buffer.speech_started') {
+      console.log('User speaking');
+      stopPlayback(); // interrupt agent
+    }
+    else if (msg.type === 'response.output_audio_transcript.done') {
+      console.log('Agent:', msg.transcript);
+    }
+    else if (msg.type === 'conversation.item.input_audio_transcription.completed') {
+      console.log('You:  ', msg.transcript);
+    }
+    else if (msg.type === 'response.output_item.done') { // capture tool calls as each output item completes
+      const item = msg.item;
+      if (item?.type === 'function_call') {
+        pendingCalls.push({ name: item.name, callId: item.call_id,
+                            arguments: JSON.parse(item.arguments || '{}') }); // NOTE(review): throws on malformed JSON — confirm arguments are always valid JSON
+      }
+    }
+    else if (msg.type === 'response.done') { // run queued tools only after the response has finished
+      if (pendingCalls.length > 0) {
+        const calls = pendingCalls.splice(0); // take every queued call and empty the queue
+        for (const call of calls) {
+          const result = runTool(call.name, call.arguments);
+          ws.send(JSON.stringify({
+            type: 'conversation.item.create',
+            item: {
+              id: `item_${crypto.randomUUID().replace(/-/g, '').slice(0, 24)}`,
+              type: 'function_call_output',
+              call_id: call.callId,
+              output: JSON.stringify(result),
+            },
+          })); // no response.create is sent afterwards; per this guide the server generates the follow-up itself
+        }
+      }
+    }
+    else if (msg.type === 'error') {
+      console.error('Error:', msg.error);
+    }
+  };
+
+  // Start mic and forward audio
+  const mic = await startMic((base64Audio) => {
+    if (ws.readyState === WebSocket.OPEN && sessionReady) { // drop chunks until the socket is open and config has been sent
+      ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: base64Audio }));
+    }
+  });
+
+  return { stop: () => { mic.stop(); ws.close(); } };
+}
+
+function runTool(name, args) {
+  if (name === 'get_weather') {
+    return { temperature: 72, condition: 'sunny', location: args.location };
+  }
+  return { error: `Unknown tool: ${name}` };
+}
+
+// Start on button click (required for AudioContext autoplay policy)
+document.getElementById('start').addEventListener('click', startAgent);
+```
+
+</Accordion>
+
+<Note>
+Browsers can't set custom headers on WebSocket connections. In production, connect through a proxy server that injects your API key. See the [proxy pattern](/docs/speech-to-text/universal-streaming/voice-agents/proxy) for a minimal FastAPI implementation.
+</Note>
 
-Direct WebSocket control — useful if you can't use the OpenAI SDK or need the flat session format.
+</Tab>
+
+<Tab title="Raw WebSocket (Python)">
 
 ```bash
 pip install websockets sounddevice
 ```
 
-```python
+Direct WebSocket control. Use this if you prefer the flat session format or can't use the OpenAI SDK.
+
+<Accordion title="Full example">
+
+```python agent_ws.py
 import asyncio, base64, json, queue, threading, time, uuid
 import sounddevice as sd
 import websockets
@@ -739,49 +1814,34 @@ def run_tool(name, args):
 
 class AudioPlayer:
     def __init__(self):
-        self._buf = bytearray()
-        self._lock = threading.Lock()
-        self._out = sd.RawOutputStream(
-            samplerate=SAMPLE_RATE, channels=1, dtype="int16",
-            blocksize=480, latency="low", callback=self._callback,
-        )
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback)
         self._out.start()
 
-    def _callback(self, outdata, frames, time_info, status):
+    def _callback(self, outdata, frames, *_):
         n = frames * 2
         with self._lock:
-            chunk = bytes(self._buf[:n])
-            del self._buf[:n]
-        if len(chunk) < n:
-            chunk += b'\x00' * (n - len(chunk))
-        outdata[:] = chunk
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        outdata[:] = chunk + b'\x00' * (n - len(chunk))
 
-    def play(self, pcm: bytes):
-        with self._lock:
-            self._buf.extend(pcm)
+    def play(self, pcm):
+        with self._lock: self._buf.extend(pcm)
 
     def clear(self):
-        with self._lock:
-            self._buf.clear()
+        with self._lock: self._buf.clear()
 
-    def close(self):
-        self._out.stop(); self._out.close()
+    def close(self): self._out.stop(); self._out.close()
 
 
 async def main():
-    player = AudioPlayer()
-    q = queue.Queue()
+    player, q = AudioPlayer(), queue.Queue()
 
-    def mic_cb(data, frames, ti, status):
-        q.put_nowait(bytes(data))
+    def mic_cb(data, frames, ti, status): q.put_nowait(bytes(data))
+    sd.RawInputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+        blocksize=480, callback=mic_cb, latency="low").start()
 
-    mic = sd.RawInputStream(
-        samplerate=SAMPLE_RATE, channels=1, dtype="int16",
-        blocksize=480, callback=mic_cb, latency="low",
-    )
-    mic.start()
-
-    # websockets 13.x uses extra_headers, 14.x+ uses additional_headers
+    # websockets 13.x uses extra_headers; 14.x+ uses additional_headers
     try:
         ws = await websockets.connect(WS_URL, extra_headers={"Authorization": f"Bearer {API_KEY}"})
     except TypeError:
@@ -802,23 +1862,17 @@ async def main():
     }}))
 
     loop = asyncio.get_running_loop()
-
-    while not q.empty():
-        q.get_nowait()
+    while not q.empty(): q.get_nowait()
 
     async def stream_mic():
         while True:
             pcm = await loop.run_in_executor(None, q.get)
             buf = bytearray(pcm)
             while not q.empty():
-                try:
-                    buf.extend(q.get_nowait())
-                except Exception:
-                    break
-            await ws.send(json.dumps({
-                "type": "input_audio_buffer.append",
-                "audio": base64.b64encode(bytes(buf)).decode(),
-            }))
+                try: buf.extend(q.get_nowait())
+                except: break
+            await ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(bytes(buf)).decode()}))
 
     async def handle_events():
         pending_calls = []
@@ -828,58 +1882,37 @@ async def main():
             t = time.strftime("%H:%M:%S")
 
             if et == "session.created":
-                print(f"[{t}] Connected — session {e['session']['id']}")
+                print(f"[{t}] Connected — {e['session']['id']}")
                 await ws.send(json.dumps({"type": "input_audio_buffer.clear"}))
-
             elif et == "input_audio_buffer.speech_started":
-                print(f"[{t}] You started speaking")
-                player.clear()
-
-            elif et == "input_audio_buffer.speech_stopped":
-                print(f"[{t}] You stopped speaking")
-
+                print(f"[{t}] You speaking"); player.clear()
             elif et == "conversation.item.input_audio_transcription.completed":
-                txt = e.get("transcript", "")
-                if txt:
-                    print(f"[{t}] You:   {txt}")
-
+                if txt := e.get("transcript", ""): print(f"[{t}] You:   {txt}")
             elif et == "response.output_audio.delta":
                 player.play(base64.b64decode(e["delta"]))
-
             elif et == "response.output_audio_transcript.done":
                 print(f"[{t}] Agent: {e.get('transcript', '')}")
-
             elif et == "response.output_item.done":
                 item = e.get("item", {})
                 if item.get("type") == "function_call":
-                    call_id = item.get("call_id", "")
+                    call_id = item["call_id"]
                     if not any(c["call_id"] == call_id for c in pending_calls):
-                        pending_calls.append({
-                            "name": item.get("name", ""),
-                            "call_id": call_id,
-                            "arguments": json.loads(item.get("arguments", "{}")),
-                        })
-                        print(f"[{t}] Tool:  {item['name']}({item.get('arguments', '{}')})")
-
+                        pending_calls.append({"name": item["name"], "call_id": call_id,
+                                              "arguments": json.loads(item.get("arguments", "{}"))})
             elif et == "response.done":
                 s = e.get("response", {}).get("status", "?")
                 print(f"[{t}] Done ({s})")
                 if pending_calls:
-                    calls = pending_calls[:]
-                    pending_calls.clear()
+                    calls = pending_calls[:]; pending_calls.clear()
                     for call in calls:
                         result = run_tool(call["name"], call["arguments"])
-                        print(f"[{t}]   → {call['name']}: {json.dumps(result)[:80]}")
                         await ws.send(json.dumps({
                             "type": "conversation.item.create",
-                            "item": {
-                                "id": f"item_{uuid.uuid4().hex[:24]}",
-                                "type": "function_call_output",
-                                "call_id": call["call_id"],
-                                "output": json.dumps(result),
-                            },
+                            "item": {"id": f"item_{uuid.uuid4().hex[:24]}",
+                                     "type": "function_call_output",
+                                     "call_id": call["call_id"],
+                                     "output": json.dumps(result)},
                         }))
-
             elif et == "error":
                 print(f"[{t}] Error: {e.get('error', {})}")
 
@@ -889,22 +1922,26 @@ async def main():
     except KeyboardInterrupt:
         pass
     finally:
-        mic.stop(); mic.close(); player.close(); await ws.close()
+        player.close(); await ws.close()
 
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
-### LiveKit Agents
+</Accordion>
+
+</Tab>
 
-Uses the [LiveKit Agents framework](https://docs.livekit.io/agents/) with the OpenAI Realtime plugin. LiveKit handles audio transport, room management, and client connections — you define the agent behavior.
+<Tab title="LiveKit Agents">
 
 ```bash
 pip install "livekit-agents[openai,silero]" python-dotenv
 ```
 
-```python
+Uses the [LiveKit Agents framework](https://docs.livekit.io/agents/) for production deployments with WebRTC transport, room management, and client SDKs for web and mobile.
+
+```python agent.py
 import asyncio, os
 from dotenv import load_dotenv
 from livekit.agents import Agent, AgentServer, AgentSession, JobContext, JobProcess, RunContext, cli, function_tool
@@ -965,8 +2002,63 @@ if __name__ == "__main__":
     cli.run_app(server)
 ```
 
-Run with:
-
 ```bash
 python agent.py console
 ```
+
+</Tab>
+
+</Tabs>
+
+---
+
+## Events reference
+
+### Client → Server
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.update` | Configure the session | `session`: config object |
+| `input_audio_buffer.append` | Stream an audio chunk | `audio`: base64 PCM16 |
+| `input_audio_buffer.commit` | Commit buffered audio as a user turn | — |
+| `input_audio_buffer.clear` | Discard buffered audio | — |
+| `conversation.item.create` | Add a message or tool result | `item` |
+| `conversation.item.delete` | Remove a conversation item | `item_id` |
+| `response.create` | Trigger the agent to respond | — |
+| `response.cancel` | Cancel an in-progress response | — |
+
+### Server → Client
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.created` | Session initialized — clear buffer here | `session.id` |
+| `session.updated` | Config applied | `session` |
+| `input_audio_buffer.speech_started` | User started speaking | `audio_start_ms` |
+| `input_audio_buffer.speech_stopped` | User stopped speaking | `audio_end_ms` |
+| `input_audio_buffer.committed` | Audio committed as a turn | — |
+| `conversation.item.created` | New item added | `item` |
+| `conversation.item.input_audio_transcription.completed` | User speech transcribed | `transcript` |
+| `response.created` | Agent started generating | — |
+| `response.output_item.done` | Output item complete — capture tool calls here | `item` |
+| `response.output_audio.delta` | Agent audio chunk | `delta`: base64 PCM16 |
+| `response.output_audio.done` | Agent audio complete | — |
+| `response.output_audio_transcript.delta` | Agent text (streaming) | `delta` |
+| `response.output_audio_transcript.done` | Agent text (final) | `transcript` |
+| `response.done` | Response complete — inject tool results here | `response.status` |
+| `error` | Error occurred | `error.message` |
+
+---
+
+## What's next
+
+<CardGroup cols={3}>
+  <Card title="Universal Streaming STT" href="/docs/speech-to-text/universal-streaming">
+    Use AssemblyAI's STT on its own — real-time transcription with speaker diarization, word timestamps, and more.
+  </Card>
+  <Card title="LeMUR" href="/docs/lemur">
+    Apply LLMs to audio — summarize calls, extract action items, answer questions about recordings.
+  </Card>
+  <Card title="Audio Intelligence" href="/docs/audio-intelligence">
+    Sentiment analysis, topic detection, PII redaction, and more on top of transcription.
+  </Card>
+</CardGroup>

From 07d21b96923da275e8354adc3d8bfb2574fda051 Mon Sep 17 00:00:00 2001
From: dan-ince-aai <dince@assemblyai.com>
Date: Tue, 24 Feb 2026 13:40:10 +0000
Subject: [PATCH 6/7] fix

---
 .../voice-agents/speechtospeech.mdx           | 129 +++++-------------
 1 file changed, 31 insertions(+), 98 deletions(-)

diff --git a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
index 616771055..ae75c891f 100644
--- a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
+++ b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -996,113 +996,46 @@ Create `CLAUDE.md` at the root of your project with the context below. Claude Co
 ```markdown CLAUDE.md
 # AssemblyAI Speech-to-Speech API
 
-## Connection sequence (critical — order matters)
-1. Connect WebSocket to `wss://speech-to-speech.assemblyai.com/v1`
-2. Send `session.update` IMMEDIATELY — do NOT wait for `session.created`
-3. Start streaming mic audio right away (server needs audio to warm up)
-4. When `session.created` arrives: call `input_audio_buffer.clear()`, drain mic queue
-5. From here, fresh audio flows with config applied
-
-## OpenAI SDK (Python) session config
-```python
-await connection.session.update(session={
-    "type": "realtime",            # required for nested format
-    "instructions": "...",
-    "output_modalities": ["audio", "text"],
-    "audio": {
-        "input": {
-            "format": {"type": "audio/pcm", "rate": 24000},
-            "transcription": {"model": "universal-streaming"},
-            "turn_detection": {
-                "type": "server_vad",
-                "threshold": 0.5,      # raise to 0.6-0.7 if triggering on noise
-                "prefix_padding_ms": 300,
-                "silence_duration_ms": 200,  # raise to 400-500 if interrupting too early
-            },
-        },
-        "output": {"format": {"type": "audio/pcm", "rate": 24000}, "voice": "sage"},
-    },
-    "tools": [...],
-    "tool_choice": "auto",
-})
-```
-
-## Tool calling (capture → wait → inject)
-```python
-pending_calls = []
-
-# Capture on response.output_item.done (not response.function_call_arguments.done)
-if et == "response.output_item.done":
-    item = e.get("item", {})
-    if item.get("type") == "function_call":
-        pending_calls.append({"name": item["name"], "call_id": item["call_id"],
-                               "arguments": json.loads(item.get("arguments", "{}"))})
-
-# Inject on response.done — server auto-generates follow-up, no response.create needed
-elif et == "response.done":
-    for call in pending_calls:
-        result = run_tool(call["name"], call["arguments"])
-        await connection.conversation.item.create(item={
-            "id": f"item_{uuid.uuid4().hex[:24]}",  # id field is REQUIRED
-            "type": "function_call_output",
-            "call_id": call["call_id"],
-            "output": json.dumps(result),
-        })
-    pending_calls.clear()
-```
+## Connection sequence (order matters)
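+1. Connect to wss://speech-to-speech.assemblyai.com/v1/realtime (with the OpenAI SDK, set websocket_base_url to wss://speech-to-speech.assemblyai.com/v1 — the SDK appends /realtime)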
+2. Send session.update IMMEDIATELY — do NOT wait for session.created
+3. Start streaming mic audio right away (server warms up on this audio)
+4. On session.created: call input_audio_buffer.clear(), drain mic queue
+5. Fresh audio flows from here with config applied
+
+## Session config (OpenAI SDK nested format)
+Include "type": "realtime" in the session object.
+turn_detection threshold: raise to 0.6-0.7 if triggering on noise.
+silence_duration_ms: raise to 400-500 if agent interrupts too early.
+
+## Tool calling pattern
+- Capture calls via response.output_item.done (item.type == "function_call")
+- Send results on response.done — server auto-generates follow-up, no response.create needed
+- Always include "id": f"item_{uuid.uuid4().hex[:24]}" in function_call_output items
+- Process all pending calls from one response together on response.done
 
 ## Stage-based prompting
-Update instructions mid-session to keep the agent focused at each phase:
-```python
-await connection.session.update(session={
-    "type": "realtime",
-    "instructions": new_stage_instructions,
-})
-```
-Do this:
-- After the opening exchange (greeting → main task)
-- Before injecting a tool result (so post-tool speech uses focused instructions)
-- When recovering from an error (reset back to earlier stage)
+Update instructions mid-session: await connection.session.update(session={"type": "realtime", "instructions": new_instructions})
+Do this: after greeting, before injecting tool results, when recovering from errors.
+Switch stage BEFORE injecting tool result — agent responds using instructions active at that moment.
 
 ## AudioPlayer — must be callback-based
-Never use blocking `write()` on the event loop — it starves mic sending and delays everything:
-```python
-class AudioPlayer:
-    def _callback(self, outdata, frames, *_):  # runs in audio thread — non-blocking
-        n = frames * 2
-        with self._lock:
-            chunk = bytes(self._buf[:n]); del self._buf[:n]
-        outdata[:] = chunk + b'\x00' * (n - len(chunk))
-
-    def play(self, pcm: bytes):  # returns instantly — just appends
-        with self._lock: self._buf.extend(pcm)
-
-    def clear(self):  # call on speech_started to stop agent audio immediately
-        with self._lock: self._buf.clear()
-```
+Use sd.RawOutputStream with callback= parameter. Never call blocking write() on the event loop.
+play() just appends to a buffer. clear() empties it (call on speech_started for interruption).
 
 ## Mic streaming
-Use `queue.Queue` (thread-safe) + `run_in_executor` to bridge mic to async:
-```python
-q = queue.Queue()
-# mic callback: q.put_nowait(bytes(data))
-
-async def stream_mic():
-    while True:
-        pcm = await loop.run_in_executor(None, q.get)  # non-blocking wait
-        await raw_ws.send(json.dumps({...}))
-```
-
-Use `raw_ws = connection._connection` and send via `raw_ws.send()` directly for audio
+Use queue.Queue (thread-safe, not asyncio.Queue).
+Use loop.run_in_executor(None, q.get) to await queue in async context without blocking.
+Use raw_ws = connection._connection and raw_ws.send() directly for audio sends
 — bypasses SDK Pydantic overhead on every 20ms chunk.
 
 ## Common pitfalls
-- ❌ Sending config on `session.created` — send it BEFORE, immediately after connect
-- ❌ Using `asyncio.Queue` in threads — it's not thread-safe, use `queue.Queue`
-- ❌ Blocking `write()` for audio playback — starves the event loop
-- ❌ Chaining tool calls across multiple responses — process all at once in `response.done`
-- ❌ Missing `id` field in `function_call_output` — breaks tool result association
-- ❌ Large chunks (100ms+) — adds VAD latency. Keep to ~20ms (480 samples at 24kHz)
+- Send config BEFORE session.created — not on it, not after
+- asyncio.Queue is not thread-safe — always use queue.Queue for mic
+- Blocking write() for audio starves the event loop — use callback-based RawOutputStream
+- Process all tool calls from a response in one response.done batch — don't chain them across separate responses
+- Missing "id" field in function_call_output breaks tool result association
+- Chunks >20ms add VAD latency — use 480 samples (20ms) at 24kHz
 ```
 </Step>
 

From b310e890b7dcb9a7f746a7c43de00f86e76c5793 Mon Sep 17 00:00:00 2001
From: dan-ince-aai <dince@assemblyai.com>
Date: Tue, 24 Feb 2026 13:53:47 +0000
Subject: [PATCH 7/7] cleanup

---
 .../voice-agents/speechtospeech.mdx           | 55 +++++++++++++++----
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
index ae75c891f..54c84fa77 100644
--- a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
+++ b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -1242,8 +1242,7 @@ Used when connecting with raw WebSocket.
       "type": "server_vad",
       "threshold": 0.5,
       "prefix_padding_ms": 300,
-      "silence_duration_ms": 200,
-      "create_response": true
+      "silence_duration_ms": 200
     },
     "tools": [],
     "tool_choice": "auto"
@@ -1288,8 +1287,7 @@ Used when connecting with raw WebSocket.
   "type": "server_vad",
   "threshold": 0.5,
   "prefix_padding_ms": 300,
-  "silence_duration_ms": 200,
-  "create_response": true
+  "silence_duration_ms": 200
 }
 ```
 
@@ -1305,10 +1303,6 @@ Used when connecting with raw WebSocket.
   Pause length before considering a turn complete. **Raise to 400–500ms** if the agent interrupts before the user has finished speaking.
 </ParamField>
 
-<ParamField path="create_response" type="boolean" default="true">
-  Auto-generate a response when the user finishes their turn.
-</ParamField>
-
 ### Voices
 
 | Voice | ID | Character |
@@ -1509,7 +1503,7 @@ if __name__ == "__main__":
 npm install openai
 ```
 
-Browser-based voice agent using Web Audio API for low-latency, gapless playback. Connect via your own proxy server (browsers can't set auth headers on WebSocket — see the proxy pattern below).
+Browser-based voice agent using Web Audio API for low-latency, gapless playback. Browsers can't set auth headers on WebSocket, so connect via a lightweight proxy that injects your API key (see the proxy example in the note below).
 
 <Accordion title="Full browser example">
 
@@ -1703,7 +1697,47 @@ document.getElementById('start').addEventListener('click', startAgent);
 </Accordion>
 
 <Note>
-Browsers can't set custom headers on WebSocket connections. In production, connect through a proxy server that injects your API key. See the [proxy pattern](/docs/speech-to-text/universal-streaming/voice-agents/proxy) for a minimal FastAPI implementation.
+Browsers can't set custom headers on WebSocket connections. Run a minimal proxy that forwards the connection with your API key injected:
+
+```python proxy.py
+from fastapi import FastAPI, WebSocket
+from openai import AsyncOpenAI
+import asyncio, json
+
+app = FastAPI()
+client = AsyncOpenAI(api_key="YOUR_ASSEMBLYAI_API_KEY",
+                     websocket_base_url="wss://speech-to-speech.assemblyai.com/v1")
+
+@app.websocket("/api/s2s")
+async def proxy(websocket: WebSocket):
+    await websocket.accept()
+    conn = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+    raw_ws = conn._connection
+
+    async def from_client():
+        try:
+            while True:
+                await raw_ws.send(await websocket.receive_text())
+        except Exception: pass
+
+    async def from_server():
+        try:
+            while True:
+                await websocket.send_text((await conn.recv_bytes()).decode())
+        except Exception: pass
+
+    _, pending = await asyncio.wait(
+        [asyncio.create_task(from_client()), asyncio.create_task(from_server())],
+        return_when=asyncio.FIRST_COMPLETED,
+    )
+    for t in pending: t.cancel()
+    await conn.close()
+```
+
+Then connect the browser to `ws://localhost:8000/api/s2s`.
 </Note>
 
 </Tab>
@@ -1923,7 +1957,6 @@ async def entrypoint(ctx: JobContext):
                 threshold=0.5,
                 prefix_padding_ms=300,
                 silence_duration_ms=200,
-                create_response=True,
             ),
         )
     )