diff --git a/fern/assets/components/AgentGenerator.tsx b/fern/assets/components/AgentGenerator.tsx
new file mode 100644
index 000000000..1c09db552
--- /dev/null
+++ b/fern/assets/components/AgentGenerator.tsx
@@ -0,0 +1,453 @@
+"use client";
+import * as React from "react";
+
+/** Public documentation page for the Speech-to-Speech API; interpolated at the end of LLM_CONTEXT. */
+const DOCS_URL =
+  "https://www.assemblyai.com/docs/speech-to-text/voice-agents/speechtospeech";
+
+/** Output variants the generator can request; each member keys an entry in FORMAT_INSTRUCTIONS. */
+type OutputFormat = "python" | "javascript" | "config";
+
+/**
+ * Static context placed at the top of every generated prompt: a condensed
+ * Speech-to-Speech API reference followed by Python and Node.js starter
+ * templates and a link to the full docs (DOCS_URL).
+ *
+ * The text is sent verbatim to a third-party assistant, so the embedded
+ * templates are what generated agents will be modelled on. Backticks and
+ * backslashes inside the templates are escaped for the enclosing template
+ * literal.
+ */
+const LLM_CONTEXT = `You are an expert at building real-time voice agents using the AssemblyAI Speech-to-Speech API. Based on the user's description, generate a complete voice agent implementation.
+
+## AssemblyAI Speech-to-Speech API Reference
+
+Endpoint: wss://speech-to-speech.assemblyai.com/v1/realtime
+Auth: Authorization: Bearer YOUR_ASSEMBLYAI_API_KEY header on WebSocket connect
+Audio: PCM16 (signed 16-bit little-endian), mono, 24000 Hz, base64-encoded in JSON
+Voices: sage (default), ember, breeze, cascade
+
+### Session config (flat format, for raw WebSocket):
+{
+  "type": "session.update",
+  "session": {
+    "instructions": "System prompt here",
+    "voice": "sage",
+    "input_audio_format": "pcm16",
+    "input_audio_sample_rate": 24000,
+    "output_audio_format": "pcm16",
+    "output_audio_sample_rate": 24000,
+    "input_audio_transcription": {"model": "universal-streaming"},
+    "output_modalities": ["audio", "text"],
+    "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+    "tools": [],
+    "tool_choice": "auto"
+  }
+}
+
+### Tool definition schema:
+{
+  "type": "function",
+  "name": "tool_name",
+  "description": "What this tool does",
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "param_name": {"type": "string", "description": "Param description"}
+    },
+    "required": ["param_name"]
+  }
+}
+
+### Tool calling pattern:
+- On "response.function_call_arguments.done": Start executing the function immediately (use asyncio.create_task)
+- On "response.done": Send results back via "conversation.item.create" with type "function_call_output"
+- Do NOT send "response.create" after tool results — the server continues automatically
+- Interruptions are handled automatically by server-side VAD — no client logic needed
+
+### Key events:
+Client sends: session.update, input_audio_buffer.append (base64 audio)
+Server sends: session.created, input_audio_buffer.speech_started, input_audio_buffer.speech_stopped, conversation.item.input_audio_transcription.completed, response.output_audio.delta (base64 audio), response.output_audio_transcript.done, response.function_call_arguments.done, response.done
+
+### Python quickstart template (raw WebSocket with websockets + sounddevice):
+\`\`\`python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+import websockets
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+TOOLS = []  # Add tool definitions here
+
+async def run_tool(name, args):
+    # Implement tool logic here
+    return {"error": f"Unknown tool: {name}"}
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low")
+        self._out.start()
+    def play(self, pcm):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+    def close(self):
+        self._out.stop(); self._out.close()
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+    loop = asyncio.get_running_loop()
+    def mic_cb(data, frames, ti, status):
+        # Runs on the audio thread; asyncio.Queue is not thread-safe
+        loop.call_soon_threadsafe(q.put_nowait, bytes(data))
+    mic = sd.RawInputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, callback=mic_cb, latency="low")
+    mic.start()
+
+    ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "SYSTEM_PROMPT_HERE",
+        "voice": "sage",
+        "tools": TOOLS,
+        "tool_choice": "auto",
+    }}))
+    pending_tasks = {}
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"[{t}] You:   {e.get('transcript', '')}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                if pending_tasks and e.get("response", {}).get("status") == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await ws.send(json.dumps({"type": "conversation.item.create", "item": {"type": "function_call_output", "call_id": cid, "output": json.dumps(result)}}))
+                    pending_tasks.clear()
+    print("Listening — start talking.\\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+\`\`\`
+
+### JavaScript quickstart template (raw WebSocket in Node.js):
+\`\`\`javascript
+// Requires: npm install ws
+const WebSocket = require("ws");
+const API_KEY = "YOUR_ASSEMBLYAI_API_KEY";
+const WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime";
+
+const TOOLS = [];  // Add tool definitions here
+
+function runTool(name, args) {
+  // Implement tool logic here
+  return { error: "Unknown tool: " + name };
+}
+
+const ws = new WebSocket(WS_URL, { headers: { Authorization: "Bearer " + API_KEY } });
+const pendingTasks = new Map();
+
+ws.on("open", () => {
+  ws.send(JSON.stringify({ type: "session.update", session: {
+    input_audio_format: "pcm16", input_audio_sample_rate: 24000,
+    output_audio_format: "pcm16", output_audio_sample_rate: 24000,
+    input_audio_transcription: { model: "universal-streaming" },
+    turn_detection: { type: "server_vad", threshold: 0.5, prefix_padding_ms: 300, silence_duration_ms: 200 },
+    output_modalities: ["audio", "text"],
+    instructions: "SYSTEM_PROMPT_HERE",
+    voice: "sage",
+    tools: TOOLS,
+    tool_choice: "auto",
+  }}));
+  // Start streaming mic audio as base64 PCM16 via input_audio_buffer.append
+});
+
+ws.on("message", async (raw) => {
+  const e = JSON.parse(raw);
+  if (e.type === "response.output_audio.delta") {
+    // Play base64-decoded PCM16 audio: Buffer.from(e.delta, "base64")
+  } else if (e.type === "response.output_audio_transcript.done") {
+    console.log("Agent:", e.transcript);
+  } else if (e.type === "response.function_call_arguments.done") {
+    pendingTasks.set(e.call_id, runTool(e.name, JSON.parse(e.arguments)));
+  } else if (e.type === "response.done" && pendingTasks.size > 0) {
+    for (const [callId, resultPromise] of pendingTasks) {
+      const result = await resultPromise;
+      ws.send(JSON.stringify({ type: "conversation.item.create", item: { type: "function_call_output", call_id: callId, output: JSON.stringify(result) } }));
+    }
+    pendingTasks.clear();
+  }
+});
+\`\`\`
+
+Full documentation: ${DOCS_URL}`;
+
+/**
+ * Per-format instructions inserted under the "## Output format" heading of the
+ * generated prompt. The Python and JavaScript entries refer to the templates
+ * "above", i.e. the ones contained in LLM_CONTEXT, which precedes this text
+ * in the final prompt.
+ */
+const FORMAT_INSTRUCTIONS: Record<OutputFormat, string> = {
+  python: `Generate a COMPLETE, RUNNABLE Python script using the raw WebSocket template above. Include:
+1. A detailed system prompt in the instructions field tailored to the agent's purpose
+2. All tool definitions in the TOOLS array with proper JSON schemas
+3. Full run_tool() implementation with realistic mock data for each tool
+4. All imports, the AudioPlayer class, mic handling, and event loop — everything needed to pip install and run
+Choose an appropriate voice from: sage, ember, breeze, cascade.`,
+
+  javascript: `Generate a COMPLETE, RUNNABLE JavaScript/Node.js script using the JS WebSocket template above. Include:
+1. A detailed system prompt in the instructions field tailored to the agent's purpose
+2. All tool definitions in the TOOLS array with proper JSON schemas
+3. Full runTool() implementation with realistic mock data for each tool
+4. All requires, WebSocket setup, and event handling — everything needed to npm install and run
+Choose an appropriate voice from: sage, ember, breeze, cascade.
+For audio I/O, use a comment placeholder since Node.js audio libraries vary.`,
+
+  config: `Generate ONLY the session configuration JSON (the session.update payload) with:
+1. A detailed system prompt in the instructions field tailored to the agent's purpose
+2. All tool definitions in the tools array with proper JSON schemas
+3. An appropriate voice chosen from: sage, ember, breeze, cascade
+4. All audio format and turn detection settings filled in
+Output ONLY the JSON — no script wrapper.`,
+};
+
+/** Longest description, in UTF-16 code units, that is passed into the prompt without truncation. */
+const MAX_DESCRIPTION_CHARS = 2000;
+/** Length budget for the complete "open in …" URL: base URL plus the URI-encoded prompt. */
+const MAX_URL_LENGTH = 8000;
+
+/**
+ * Shortens `text` to at most `maxLength` UTF-16 code units, preferring to end
+ * on a whitespace boundary so the last word is not cut in half.
+ *
+ * @param text - The string to shorten.
+ * @param maxLength - Maximum length of the result; values below 0 are treated as 0.
+ * @returns `text` unchanged when it already fits, otherwise a prefix of it that
+ *   never ends in an unpaired high surrogate.
+ */
+const truncateAtWordBoundary = (text: string, maxLength: number): string => {
+  if (text.length <= maxLength) return text;
+
+  let truncated = text.substring(0, Math.max(0, maxLength));
+
+  // A cut between the two halves of a surrogate pair leaves a lone high
+  // surrogate, which makes a later encodeURIComponent call throw URIError.
+  const lastUnit = truncated.charCodeAt(truncated.length - 1);
+  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
+    truncated = truncated.slice(0, -1);
+  }
+
+  // Index of the last whitespace character (space, tab, newline, …), or -1.
+  // Descriptions come from a textarea, so line breaks are common separators.
+  const lastBreak = truncated.search(/\s\S*$/);
+
+  // Only back up to the boundary when more than 50 characters remain;
+  // otherwise one long token would shrink the result to almost nothing.
+  if (lastBreak > 50) return truncated.substring(0, lastBreak);
+  return truncated;
+};
+
+/** Prompt text substituted when the description field is left empty. */
+const EMPTY_DESCRIPTION_FALLBACK =
+  "(No description provided — generate a general-purpose helpful voice assistant)";
+
+/** Suffix appended whenever the description had to be shortened. */
+const TRUNCATION_NOTICE = "\n\n[Description truncated]";
+
+/**
+ * Form that turns a free-text agent description and an output format into a
+ * prompt, then opens that prompt in Claude, ChatGPT, or Gemini via a
+ * query-string URL in a new tab.
+ *
+ * The prompt is LLM_CONTEXT, followed by the format-specific instructions,
+ * followed by the user's description. The description is the only part that is
+ * ever shortened: first to MAX_DESCRIPTION_CHARS, and further when the URL
+ * budget (MAX_URL_LENGTH) leaves less room than that.
+ */
+export function AgentGenerator() {
+  const [description, setDescription] = React.useState("");
+  const [format, setFormat] = React.useState<OutputFormat>("python");
+
+  /** Assembles the final prompt text around an already-prepared description. */
+  const renderPrompt = (descText: string) => `${LLM_CONTEXT}
+
+## Output format
+${FORMAT_INSTRUCTIONS[format]}
+
+## User's agent description
+${descText}`;
+
+  /**
+   * Builds the prompt for the current form state.
+   *
+   * @param maxContentLength - Optional cap on the description length imposed by
+   *   the URL budget. Zero, negative, or omitted means "no URL-based cap"; the
+   *   description is then limited by MAX_DESCRIPTION_CHARS only. A non-positive
+   *   budget must never wipe the description: the static context on its own
+   *   can exceed MAX_URL_LENGTH, and dropping the user's text would not make
+   *   the URL fit while making the prompt useless.
+   */
+  const buildPrompt = (maxContentLength?: number) => {
+    const descText = description || EMPTY_DESCRIPTION_FALLBACK;
+
+    // Apply a single effective limit so the truncation notice is appended at
+    // most once and is never itself cut by a second pass.
+    const limit =
+      maxContentLength !== undefined && maxContentLength > 0
+        ? Math.min(MAX_DESCRIPTION_CHARS, maxContentLength)
+        : MAX_DESCRIPTION_CHARS;
+
+    if (descText.length <= limit) return renderPrompt(descText);
+    return renderPrompt(truncateAtWordBoundary(descText, limit) + TRUNCATION_NOTICE);
+  };
+
+  /**
+   * Estimates how many description characters still fit into the URL budget.
+   *
+   * The fixed part is measured on a prompt whose description slot holds only
+   * the truncation notice, so room for the notice is reserved. The remaining
+   * budget is divided by 3 because an escaped ASCII character occupies three
+   * characters (%XX); non-ASCII text can encode longer, so this is an estimate.
+   *
+   * @returns A non-negative character count; 0 means the fixed part already
+   *   uses up the whole budget.
+   */
+  const getMaxContentLength = (baseUrl: string) => {
+    const fixedLength =
+      baseUrl.length + encodeURIComponent(renderPrompt(TRUNCATION_NOTICE)).length;
+    return Math.max(0, Math.floor((MAX_URL_LENGTH - fixedLength) / 3));
+  };
+
+  /** Opens `baseUrl` with the encoded prompt appended, in a new tab. */
+  const openWithPrompt = (baseUrl: string) => {
+    const maxLen = getMaxContentLength(baseUrl);
+    const prompt = encodeURIComponent(buildPrompt(maxLen));
+    // noopener/noreferrer: the opened page gets no handle on this window.
+    window.open(`${baseUrl}${prompt}`, "_blank", "noopener,noreferrer");
+  };
+
+  const openInClaude = () => openWithPrompt("https://claude.ai/new?q=");
+  const openInChatGPT = () => openWithPrompt("https://chat.openai.com/?q=");
+  const openInGemini = () =>
+    openWithPrompt("https://aistudio.google.com/prompts/new_chat?prompt=");
+
+  const containerStyle: React.CSSProperties = {
+    border: "1px solid var(--grayscale-a4, #e5e7eb)",
+    borderRadius: "8px",
+    padding: "24px",
+    backgroundColor: "var(--grayscale-2, #f9fafb)",
+  };
+
+  const labelStyle: React.CSSProperties = {
+    display: "block",
+    fontSize: "14px",
+    fontWeight: 500,
+    marginBottom: "8px",
+    color: "var(--grayscale-12, #111827)",
+  };
+
+  const textareaStyle: React.CSSProperties = {
+    width: "100%",
+    height: "120px",
+    padding: "12px",
+    border: "1px solid var(--grayscale-a4, #d1d5db)",
+    borderRadius: "6px",
+    fontSize: "14px",
+    fontFamily: "inherit",
+    resize: "vertical",
+    backgroundColor: "var(--grayscale-1, #ffffff)",
+    color: "var(--grayscale-12, #111827)",
+  };
+
+  const charCountStyle: React.CSSProperties = {
+    fontSize: "12px",
+    color: "var(--grayscale-11, #6b7280)",
+    marginTop: "4px",
+  };
+
+  const toggleContainerStyle: React.CSSProperties = {
+    display: "flex",
+    gap: "4px",
+    padding: "4px",
+    backgroundColor: "var(--grayscale-a3, #e5e7eb)",
+    borderRadius: "6px",
+    width: "fit-content",
+  };
+
+  /** Style for one segment of the format toggle; `active` marks the selected one. */
+  const toggleButtonStyle = (active: boolean): React.CSSProperties => ({
+    padding: "6px 16px",
+    border: "none",
+    borderRadius: "4px",
+    fontSize: "13px",
+    fontWeight: 500,
+    cursor: "pointer",
+    backgroundColor: active ? "var(--grayscale-1, #ffffff)" : "transparent",
+    color: active ? "var(--grayscale-12, #111827)" : "var(--grayscale-11, #6b7280)",
+    boxShadow: active ? "0 1px 2px rgba(0,0,0,0.08)" : "none",
+    transition: "all 0.15s ease",
+  });
+
+  const buttonBaseStyle: React.CSSProperties = {
+    display: "inline-flex",
+    alignItems: "center",
+    gap: "8px",
+    padding: "10px 20px",
+    border: "none",
+    borderRadius: "6px",
+    fontSize: "14px",
+    fontWeight: 500,
+    cursor: "pointer",
+    color: "#ffffff",
+  };
+
+  const helpTextStyle: React.CSSProperties = {
+    marginTop: "12px",
+    fontSize: "13px",
+    color: "var(--grayscale-11, #6b7280)",
+  };
+
+  return (
+    <div style={containerStyle}>
+      <div style={{ display: "flex", flexDirection: "column", gap: "16px" }}>
+        <div>
+          <label style={labelStyle}>Describe your agent</label>
+          <textarea
+            aria-label="Describe your agent"
+            value={description}
+            onChange={(e) => setDescription(e.target.value)}
+            placeholder="A customer service agent for a pizza delivery company. It can check order status by order number, estimate delivery time, process refunds, and answer questions about the menu. It should be friendly and concise."
+            style={textareaStyle}
+          />
+          <div style={charCountStyle}>
+            {description.length > 0 &&
+              `${description.length.toLocaleString()} / ${MAX_DESCRIPTION_CHARS.toLocaleString()} characters`}
+            {description.length > MAX_DESCRIPTION_CHARS && " — will be truncated"}
+          </div>
+        </div>
+
+        <div>
+          <label style={labelStyle}>Output format</label>
+          <div style={toggleContainerStyle}>
+            <button
+              type="button"
+              aria-pressed={format === "python"}
+              onClick={() => setFormat("python")}
+              style={toggleButtonStyle(format === "python")}
+            >
+              Python
+            </button>
+            <button
+              type="button"
+              aria-pressed={format === "javascript"}
+              onClick={() => setFormat("javascript")}
+              style={toggleButtonStyle(format === "javascript")}
+            >
+              JavaScript
+            </button>
+            <button
+              type="button"
+              aria-pressed={format === "config"}
+              onClick={() => setFormat("config")}
+              style={toggleButtonStyle(format === "config")}
+            >
+              Config only
+            </button>
+          </div>
+        </div>
+
+        <div style={{ marginTop: "8px" }}>
+          <label style={labelStyle}>Generate with AI</label>
+          <div style={{ display: "flex", gap: "12px", flexWrap: "wrap" }}>
+            <button
+              type="button"
+              onClick={openInClaude}
+              style={{ ...buttonBaseStyle, backgroundColor: "#d97706" }}
+            >
+              <svg width="16" height="16" viewBox="0 0 16 17" fill="none" xmlns="http://www.w3.org/2000/svg">
+                <path fillRule="evenodd" clipRule="evenodd" d="M9.218 2.52954H11.62L16 13.5162H13.598L9.218 2.52954ZM4.37933 2.52954H6.89067L11.2707 13.5162H8.82133L7.926 11.2089H3.34467L2.44867 13.5155H0L4.38 2.53087L4.37933 2.52954ZM7.134 9.16887L5.63533 5.30754L4.13667 9.16954H7.13333L7.134 9.16887Z" fill="currentColor"/>
+              </svg>
+              Open in Claude
+            </button>
+            <button
+              type="button"
+              onClick={openInChatGPT}
+              style={{ ...buttonBaseStyle, backgroundColor: "#10a37f" }}
+            >
+              <svg width="16" height="16" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
+                <path d="M22.0606 9.86697C22.6034 8.23781 22.4165 6.45314 21.5485 4.97127C20.2431 2.69837 17.6188 1.52902 15.0558 2.0793C13.9156 0.794818 12.2774 0.0643507 10.5601 0.074818C7.94025 0.0688367 5.61576 1.75557 4.80978 4.24828C3.12679 4.59295 1.67408 5.64641 0.823986 7.13949C-0.491154 9.40641-0.191341 12.264 1.56567 14.2079C1.02286 15.8371 1.20978 17.6217 2.07782 19.1036C3.38324 21.3765 6.00754 22.5458 8.57053 21.9956C9.70997 23.2801 11.3488 24.0105 13.0662 23.9993C15.6875 24.006 18.0128 22.3178 18.8188 19.8229C20.5017 19.4782 21.9545 18.4247 22.8045 16.9316C24.1182 14.6647 23.8176 11.8094 22.0614 9.86547L22.0606 9.86697ZM13.0677 22.4359C12.0188 22.4374 11.0027 22.0703 10.1974 21.3982L10.3388 21.3182L15.1029 18.5668C15.3466 18.4285 15.4961 18.169 15.4946 17.8886V11.1724L17.5081 12.335C17.5298 12.3455 17.544 12.3664 17.547 12.3903V17.9522C17.544 20.4255 15.541 22.4307 13.0677 22.4359ZM3.43483 18.3215C2.90922 17.4139 2.72006 16.35 2.90025 15.3174L3.04156 15.4019L7.80567 18.1533C8.04716 18.2946 8.34623 18.2946 8.58847 18.1533L14.4045 14.7948V17.1201C14.406 17.144 14.3948 17.1672 14.3761 17.1821L9.56044 19.9627C7.41539 21.1978 4.67595 20.4636 3.43558 18.3215H3.43483ZM2.181 7.92229C2.70436 7.01314 3.53053 6.31781 4.51445 5.95669V6.12117L4.51221 11.6247C4.51072 11.9043 4.66025 12.1638 4.90324 12.3021L10.7193 15.6599L8.70586 16.8225C8.68567 16.8359 8.66025 16.8382 8.63782 16.8285L3.82137 14.0457C1.68081 12.806 0.946603 10.0673 2.18025 7.92304L2.181 7.92229ZM18.7238 11.772L12.9077 8.41351L14.9212 7.25164C14.9414 7.23818 14.9668 7.23594 14.9892 7.24566L19.8057 10.0262C21.95 11.2651 22.6849 14.0083 21.446 16.1526C20.9219 17.0602 20.0965 17.7556 19.1133 18.1174V12.4494C19.1156 12.1698 18.9668 11.9111 18.7245 11.772H18.7238ZM20.7275 8.75594L20.5862 8.67145L15.8221 5.92005C15.5806 5.77874 15.2816 5.77874 15.0393 5.92005L9.22324 9.27856V6.95332C9.22174 6.9294 9.23296 6.90622 9.25165 6.89127L14.0674 4.11295C16.2124 2.87557 18.9548 3.61201 20.1915 5.75781C20.7141 6.66398 20.9032 7.72491 20.726 8.75594H20.7275ZM8.12866 12.9002L6.11445 11.7376C6.09277 11.7272 6.07857 11.7062 6.07558 11.6823V6.12043C6.07707 3.64416 8.08604 1.63743 10.5623 1.63893C11.6098 1.63893 12.6236 2.00678 13.4288 2.67669L13.2875 2.75669L8.52343 5.50809C8.27969 5.64641 8.13016 5.9051 8.13165 6.18547L8.12866 12.8987V12.9002ZM9.22249 10.5421L11.8131 9.04603L14.4038 10.5414V13.5328L11.8131 15.0281L9.22249 13.5328V10.5421Z" fill="currentColor"/>
+              </svg>
+              Open in ChatGPT
+            </button>
+            <button
+              type="button"
+              onClick={openInGemini}
+              style={{ ...buttonBaseStyle, backgroundColor: "#4285f4" }}
+            >
+              <svg width="16" height="16" viewBox="0 0 16 17" fill="none" xmlns="http://www.w3.org/2000/svg">
+                <path d="M8 16.5C8 12.634 11.134 9.5 15 9.5V8.5C11.134 8.5 8 5.366 8 1.5C8 5.366 4.866 8.5 1 8.5V9.5C4.866 9.5 8 12.634 8 16.5Z" fill="currentColor"/>
+              </svg>
+              Open in Gemini
+            </button>
+          </div>
+          <p style={helpTextStyle}>
+            Opens your preferred AI with your agent description and the full S2S
+            API reference pre-loaded. It will generate a complete agent with
+            system prompt, tool definitions, and runnable code.
+          </p>
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/fern/pages/02-speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/02-speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
new file mode 100644
index 000000000..595d9fd7b
--- /dev/null
+++ b/fern/pages/02-speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -0,0 +1,781 @@
+---
+title: "Speech-to-Speech"
+description: "Build real-time voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back."
+---
+
+import { AgentGenerator } from "../../../../assets/components/AgentGenerator";
+
+Build voice agents with a single WebSocket connection. Stream audio in, get intelligent spoken responses back — with built-in transcription, turn detection, and function calling. The API is compatible with the [OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime), so you can use the OpenAI SDK or any OpenAI-compatible framework like LiveKit.
+
+## Quickstart
+
+Install dependencies and talk to your agent in under a minute.
+
+```bash
+pip install websockets sounddevice
+```
+
+```python
+import asyncio, base64, json, threading
+import sounddevice as sd
+import websockets
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+
+class AudioPlayer:
+    """Buffers and plays PCM16 audio in real time."""
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+    }}))
+
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"Agent: {e.get('transcript', '')}")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"You:   {e.get('transcript', '')}")
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+Replace `YOUR_ASSEMBLYAI_API_KEY` with your key from the [AssemblyAI dashboard](https://www.assemblyai.com/dashboard/signup), run the script, and start talking.
+
+---
+
+## How it works
+
+```
+Client                                     Server
+  |                                           |
+  |--- WebSocket connect -------------------->|
+  |--- session.update (config) -------------->|
+  |--- input_audio_buffer.append ------------>|  stream mic audio
+  |                                           |
+  |<------------ session.created -------------|
+  |<------------ speech_started --------------|  user is talking
+  |<------------ speech_stopped --------------|  user finished
+  |<------------ transcription.completed -----|  what the user said
+  |<------------ response.output_audio.delta -|  agent speaks back
+  |<------------ response.done ---------------|
+  |                                           |
+```
+
+1. **Connect** — Open a WebSocket to `wss://speech-to-speech.assemblyai.com/v1/realtime` with your API key in the `Authorization: Bearer` header.
+2. **Configure** — Send a `session.update` with your voice, instructions, turn detection settings, and any tools.
+3. **Stream audio** — Send base64-encoded PCM16 audio chunks. The server detects when the user starts and stops speaking.
+4. **Receive responses** — The server transcribes the user's speech, generates a response, and streams back audio and text in real time.
+
+The API is fully compatible with the OpenAI Realtime protocol, so the [OpenAI Python SDK](https://github.com/openai/openai-python), [LiveKit Agents](https://docs.livekit.io/agents/), and any OpenAI-compatible client work out of the box — just point them at `wss://speech-to-speech.assemblyai.com/v1`.
+
+---
+
+## Agent generator
+
+Describe your agent, pick an output format, and open the pre-filled prompt in Claude, ChatGPT, or Gemini to generate the complete code — system prompt, tool definitions, and a runnable script.
+
+<AgentGenerator />
+
+---
+
+## Configuration
+
+Configure your session by sending a `session.update` event after connecting. The API accepts two session formats depending on your integration approach.
+
+### Flat format (Raw WebSocket)
+
+```json
+{
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "voice": "sage",
+    "input_audio_format": "pcm16",
+    "input_audio_sample_rate": 24000,
+    "output_audio_format": "pcm16",
+    "output_audio_sample_rate": 24000,
+    "input_audio_transcription": {"model": "universal-streaming"},
+    "output_modalities": ["audio", "text"],
+    "turn_detection": {
+      "type": "server_vad",
+      "threshold": 0.5,
+      "prefix_padding_ms": 300,
+      "silence_duration_ms": 200,
+      "create_response": true
+    },
+    "tools": [],
+    "tool_choice": "auto"
+  }
+}
+```
+
+### Nested format (OpenAI SDK / LiveKit)
+
+The OpenAI GA SDK and LiveKit plugin use a nested session format.
+
+```json
+{
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "output_modalities": ["audio", "text"],
+    "audio": {
+      "input": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "transcription": {"model": "universal-streaming"},
+        "turn_detection": {
+          "type": "server_vad",
+          "threshold": 0.5,
+          "prefix_padding_ms": 300,
+          "silence_duration_ms": 200,
+          "create_response": true
+        }
+      },
+      "output": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "voice": "sage"
+      }
+    },
+    "tools": [],
+    "tool_choice": "auto"
+  }
+}
+```
+
+### Session parameters
+
+<ParamField path="instructions" type="string">
+  System prompt for the AI agent. Defines personality, behavior, and constraints.
+</ParamField>
+
+<ParamField path="voice" type="string" default="sage">
+  Voice for agent audio responses. One of: `sage`, `ember`, `breeze`, `cascade`.
+</ParamField>
+
+<ParamField path="input_audio_format" type="string" default="pcm16">
+  Input audio encoding. Use `pcm16` (signed 16-bit little-endian).
+</ParamField>
+
+<ParamField path="input_audio_sample_rate" type="integer" default="24000">
+  Input audio sample rate in Hz.
+</ParamField>
+
+<ParamField path="output_audio_format" type="string" default="pcm16">
+  Output audio encoding. Use `pcm16` (signed 16-bit little-endian).
+</ParamField>
+
+<ParamField path="output_audio_sample_rate" type="integer" default="24000">
+  Output audio sample rate in Hz.
+</ParamField>
+
+<ParamField path="output_modalities" type="array">
+  What the agent returns. Include `"audio"` for spoken responses and `"text"` for transcripts.
+</ParamField>
+
+<ParamField path="input_audio_transcription" type="object">
+  Enables real-time transcription of user speech. Set `model` to `"universal-streaming"`.
+</ParamField>
+
+<ParamField path="turn_detection" type="object">
+  Server-side voice activity detection. See [Turn detection](#turn-detection).
+</ParamField>
+
+<ParamField path="tools" type="array" default="[]">
+  Functions the agent can call. See [Tool calling](#tool-calling).
+</ParamField>
+
+<ParamField path="tool_choice" type="string" default="auto">
+  When to use tools. `"auto"` lets the model decide.
+</ParamField>
+
+### Audio format
+
+All audio is **PCM16** (signed 16-bit integer, little-endian), **mono**, **24,000 Hz**. Audio is base64-encoded inside JSON messages. Each chunk should be approximately 20 ms (480 samples, 960 bytes).
+
+### Voices
+
+| Voice | ID |
+|-------|----|
+| Sage | `sage` |
+| Ember | `ember` |
+| Breeze | `breeze` |
+| Cascade | `cascade` |
+
+### Turn detection
+
+The server automatically detects when the user starts and stops speaking using voice activity detection (VAD). When the user finishes a turn, the agent responds automatically.
+
+```json
+"turn_detection": {
+  "type": "server_vad",
+  "threshold": 0.5,
+  "prefix_padding_ms": 300,
+  "silence_duration_ms": 200,
+  "create_response": true
+}
+```
+
+<ParamField path="type" type="string" required>
+  Set to `"server_vad"` for server-side voice activity detection.
+</ParamField>
+
+<ParamField path="threshold" type="float" default="0.5">
+  Speech detection sensitivity (0.0 to 1.0). Lower values detect quieter speech.
+</ParamField>
+
+<ParamField path="prefix_padding_ms" type="integer" default="300">
+  Audio to preserve before speech onset, in milliseconds. Prevents clipping the start of a sentence.
+</ParamField>
+
+<ParamField path="silence_duration_ms" type="integer" default="200">
+  How long the user must pause before the server considers them done speaking, in milliseconds.
+</ParamField>
+
+<ParamField path="create_response" type="boolean" default="true">
+  Automatically generate an agent response when the user finishes speaking.
+</ParamField>
+
+---
+
+## Tool calling
+
+Give your agent the ability to call functions in your application — look up data, take actions, or call external APIs — then continue the conversation with the result.
+
+### Define tools in your session config
+
+```json
+"tools": [{
+  "type": "function",
+  "name": "get_weather",
+  "description": "Get the current weather for a location",
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "location": {"type": "string", "description": "City name"}
+    },
+    "required": ["location"]
+  }
+}],
+"tool_choice": "auto"
+```
+
+### Handle tool calls
+
+When the agent decides to call a function, the server sends `response.function_call_arguments.done` while the response is still in progress. Start executing the function immediately — you don't need to wait for the response to finish. When `response.done` arrives, send the result back as a `function_call_output` item that carries the same `call_id`.
+
+```python
+pending_tasks = {}
+
+async for raw in ws:
+    e = json.loads(raw)
+    et = e.get("type", "")
+
+    if et == "response.function_call_arguments.done":
+        # Start executing immediately — don't wait for response.done
+        args = json.loads(e["arguments"])
+        pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+
+    elif et == "response.done" and pending_tasks:
+        # Response is complete — send back the results
+        for call_id, task in pending_tasks.items():
+            result = await task
+            await ws.send(json.dumps({
+                "type": "conversation.item.create",
+                "item": {
+                    "type": "function_call_output",
+                    "call_id": call_id,
+                    "output": json.dumps(result),
+                },
+            }))
+        pending_tasks.clear()
+
+    elif et == "response.output_audio.delta":
+        player.play(base64.b64decode(e["delta"]))
+```
+
+The pattern is: **receive the call** → **start executing immediately** → **send the result when `response.done` arrives**. Your function runs concurrently while the response completes, so there's no wasted time.
+
+---
+
+## Events reference
+
+### Client → Server
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.update` | Configure the session | `session`: configuration object |
+| `input_audio_buffer.append` | Stream an audio chunk | `audio`: base64-encoded PCM16 |
+| `input_audio_buffer.commit` | Commit buffered audio as a user turn | — |
+| `input_audio_buffer.clear` | Discard buffered audio | — |
+| `conversation.item.create` | Add a message or tool result | `item`: conversation item |
+| `conversation.item.delete` | Remove a conversation item | `item_id`: ID to remove |
+| `response.create` | Trigger the agent to respond | — |
+
+### Server → Client
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.created` | Session initialized | `session.id` |
+| `input_audio_buffer.speech_started` | User started speaking | `audio_start_ms` |
+| `input_audio_buffer.speech_stopped` | User stopped speaking | `audio_end_ms` |
+| `input_audio_buffer.committed` | Audio committed as a turn | — |
+| `conversation.item.created` | New conversation item added | `item` |
+| `conversation.item.input_audio_transcription.completed` | User speech transcribed | `transcript` |
+| `response.created` | Agent started generating a response | — |
+| `response.output_audio.delta` | Agent audio chunk | `delta`: base64 PCM16 |
+| `response.output_audio.done` | Agent audio complete | — |
+| `response.output_audio_transcript.delta` | Agent text (streaming) | `delta` |
+| `response.output_audio_transcript.done` | Agent text (final) | `transcript` |
+| `response.function_call_arguments.done` | Agent requesting a tool call | `call_id`, `name`, `arguments` |
+| `response.done` | Response complete | `response.status`: `completed` or `cancelled` |
+| `error` | Error occurred | `error.message` |
+
+---
+
+## Complete examples
+
+Production-ready examples for three integration approaches. Each handles microphone input, speaker output, turn detection, transcription, and tool calling.
+
+### Raw WebSocket
+
+Direct WebSocket control using the `websockets` library.
+
+```bash
+pip install websockets sounddevice
+```
+
+```python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+import websockets
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
+
+
+async def run_tool(name, args):
+    """Replace with your own tool implementations."""
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    try:
+        ws = await websockets.connect(WS_URL, extra_headers={"Authorization": f"Bearer {API_KEY}"})
+    except TypeError:
+        ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+        "tools": TOOLS,
+        "tool_choice": "auto",
+    }}))
+
+    pending_tasks = {}
+
+    async def stream_mic():
+        while True:
+            try:
+                pcm = await asyncio.wait_for(q.get(), timeout=0.1)
+                await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()}))
+            except asyncio.TimeoutError:
+                pass
+
+    async def handle_events():
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You started speaking")
+            elif et == "input_audio_buffer.speech_stopped":
+                print(f"[{t}] You stopped speaking")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"[{t}] You:   {e.get('transcript', '')}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                s = e.get("response", {}).get("status", "?")
+                print(f"[{t}] Done ({s})")
+                if pending_tasks and s == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await ws.send(json.dumps({
+                            "type": "conversation.item.create",
+                            "item": {"type": "function_call_output", "call_id": cid, "output": json.dumps(result)},
+                        }))
+                    pending_tasks.clear()
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close(); await ws.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### OpenAI Python SDK
+
+Uses the OpenAI GA Realtime API. Note the differences from the beta API: `websocket_base_url` instead of `base_url`, `client.realtime.connect()` instead of `client.beta.realtime.connect()`, and the nested session format.
+
+```bash
+pip install openai sounddevice
+```
+
+```python
+import asyncio, base64, json, threading, time
+import sounddevice as sd
+from openai import AsyncOpenAI
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+SAMPLE_RATE = 24000
+
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
+
+
+async def run_tool(name, args):
+    """Replace with your own tool implementations."""
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf = bytearray()
+        self._lock = threading.Lock()
+        self._out = sd.RawOutputStream(
+            samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low",
+        )
+        self._out.start()
+
+    def play(self, pcm: bytes):
+        with self._lock:
+            self._buf.extend(pcm)
+            while len(self._buf) >= 960:
+                self._out.write(bytes(self._buf[:960]))
+                del self._buf[:960]
+
+    def close(self):
+        self._out.stop()
+        self._out.close()
+
+
+async def main():
+    player = AudioPlayer()
+    q = asyncio.Queue()
+
+    def mic_cb(data, frames, ti, status):
+        q.put_nowait(bytes(data))
+
+    mic = sd.RawInputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480,
+        callback=mic_cb, latency="low",
+    )
+    mic.start()
+
+    connection = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+
+    print("Listening — start talking.\n")
+
+    async def send_config():
+        await connection.session.update(session={
+            "instructions": "You are a helpful voice assistant. Keep responses brief.",
+            "output_modalities": ["audio", "text"],
+            "audio": {
+                "input": {
+                    "format": {"type": "audio/pcm", "rate": 24000},
+                    "transcription": {"model": "universal-streaming"},
+                    "turn_detection": {
+                        "type": "server_vad", "threshold": 0.5,
+                        "prefix_padding_ms": 300, "silence_duration_ms": 200,
+                    },
+                },
+                "output": {
+                    "format": {"type": "audio/pcm", "rate": 24000},
+                    "voice": "sage",
+                },
+            },
+            "tools": TOOLS,
+            "tool_choice": "auto",
+        })
+
+    async def stream_mic():
+        while True:
+            pcm = await q.get()
+            await connection.input_audio_buffer.append(audio=base64.b64encode(pcm).decode())
+
+    async def handle_events():
+        pending_tasks = {}
+        while True:
+            data = await connection.recv_bytes()
+            e = json.loads(data.decode("utf-8"))
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+
+            if et == "session.created":
+                print(f"[{t}] Connected — session {e['session']['id']}")
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You started speaking")
+            elif et == "input_audio_buffer.speech_stopped":
+                print(f"[{t}] You stopped speaking")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                txt = e.get("transcript", "")
+                if txt:
+                    print(f"[{t}] You:   {txt}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.function_call_arguments.done":
+                args = json.loads(e["arguments"])
+                pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args))
+                print(f"[{t}] Tool:  {e['name']}({e['arguments']})")
+            elif et == "response.done":
+                s = e.get("response", {}).get("status", "?")
+                print(f"[{t}] Done ({s})")
+                if pending_tasks and s == "completed":
+                    for cid, task in pending_tasks.items():
+                        result = await task
+                        await connection.conversation.item.create(item={
+                            "type": "function_call_output", "call_id": cid,
+                            "output": json.dumps(result)})
+                    pending_tasks.clear()
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
+
+    try:
+        await asyncio.gather(send_config(), stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close(); player.close()
+        await connection.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### LiveKit Agents
+
+Uses the [LiveKit Agents framework](https://docs.livekit.io/agents/) with the OpenAI Realtime plugin. LiveKit handles audio transport, room management, and client connections — you define the agent behavior.
+
+```bash
+pip install "livekit-agents[openai,silero]" python-dotenv
+```
+
+```python
+import asyncio, os
+from dotenv import load_dotenv
+from livekit.agents import Agent, AgentServer, AgentSession, JobContext, JobProcess, RunContext, cli, function_tool
+from livekit.plugins import openai, silero
+from openai.types.beta.realtime.session import TurnDetection
+from openai.types.realtime import AudioTranscription
+
+load_dotenv()
+
+
+class VoiceAgent(Agent):
+    def __init__(self):
+        super().__init__(instructions="You are a helpful voice assistant. Keep responses brief.")
+
+    @function_tool
+    async def get_weather(self, context: RunContext, location: str):
+        """Get the current weather for a location.
+
+        Args:
+            location: City name
+        """
+        return f"72 degrees and sunny in {location}."
+
+
+server = AgentServer()
+
+
+def prewarm(proc: JobProcess):
+    proc.userdata["vad"] = silero.VAD.load()
+
+
+server.setup_fnc = prewarm
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext):
+    session = AgentSession(
+        llm=openai.realtime.RealtimeModel(
+            base_url="wss://speech-to-speech.assemblyai.com/v1",
+            api_key=os.environ["ASSEMBLYAI_API_KEY"],
+            model="universal-streaming",
+            voice="sage",
+            input_audio_transcription=AudioTranscription(model="universal-streaming"),
+            turn_detection=TurnDetection(
+                type="server_vad",
+                threshold=0.5,
+                prefix_padding_ms=300,
+                silence_duration_ms=200,
+                create_response=True,
+            ),
+        )
+    )
+    await session.start(agent=VoiceAgent(), room=ctx.room)
+    await ctx.connect()
+
+
+if __name__ == "__main__":
+    cli.run_app(server)
+```
+
+Run with:
+
+```bash
+python agent.py console
+```
diff --git a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
index c3c28c660..54c84fa77 100644
--- a/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
+++ b/fern/pages/speech-to-text/universal-streaming/voice-agents/speechtospeech.mdx
@@ -1,467 +1,2030 @@
 ---
 title: "Speech-to-Speech"
-description: "Build real-time voice AI agents that listen and respond naturally"
+description: "The fastest speech-to-speech API. Build production voice agents with a single WebSocket — built-in VAD, transcription, and function calling included."
 ---
 
-Build voice-powered AI agents that have natural conversations with your users. Your agent listens to speech and responds with a natural-sounding voice—all in real-time.
+Stream audio in, get intelligent spoken responses back — with built-in turn detection, real-time transcription, and function calling. OpenAI Realtime-compatible, so your existing clients work out of the box.
+
+<CardGroup cols={3}>
+  <Card title="300/300 knowledge grounding" icon="bullseye">
+    Perfect score on the S2S benchmark — correctly grounds responses in provided knowledge every time
+  </Card>
+  <Card title="Beats GPT Realtime on instruction following" icon="check">
+    270/300 vs 260/300 — follows complex, multi-step instructions more reliably
+  </Card>
+  <Card title="OpenAI-compatible" icon="plug">
+    Works with any OpenAI Realtime client — just swap the endpoint
+  </Card>
+</CardGroup>
+
+| Model | Pass Rate | Tool Use | Instruction Following | Knowledge Grounding |
+|-------|-----------|----------|-----------------------|---------------------|
+| **AssemblyAI** | **90.0%** | 270/300 | 270/300 | **300/300** |
+| GPT Realtime | 86.7% | **271/300** | 260/300 | **300/300** |
+| Gemini Live | 86.0% | 258/300 | 261/300 | 293/300 |
+| Grok Realtime | — | 267/300 | **275/300** | 295/300 |
 
 <Note>
-  This is an early stage product subject to change and should not be used for
-  production usage.
+Benchmark data from [aiewf-eval](https://github.com/kwindla/aiewf-eval) — an open-source evaluation suite for speech-to-speech models covering tool use, instruction following, and knowledge grounding.
 </Note>
 
-## How it works
+---
+
+## Your agent in 30 seconds
+
+<Steps>
+
+<Step title="Install">
+```bash
+pip install openai sounddevice
+```
+</Step>
+
+<Step title="Get your API key">
+Grab your key from the [AssemblyAI dashboard](https://www.assemblyai.com/dashboard/signup).
+</Step>
+
+<Step title="Run this">
+```python agent.py
+import asyncio, base64, json, queue, threading
+import sounddevice as sd
+from openai import AsyncOpenAI
 
+client = AsyncOpenAI(
+    api_key="YOUR_ASSEMBLYAI_API_KEY",
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=24000, channels=1,
+            dtype="int16", blocksize=480, latency="low", callback=self._cb)
+        self._out.start()
+    def _cb(self, out, frames, *_):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        out[:] = chunk + b'\x00' * (n - len(chunk))
+    def play(self, pcm):
+        with self._lock: self._buf.extend(pcm)
+    def clear(self):
+        with self._lock: self._buf.clear()
+    def close(self): self._out.stop(); self._out.close()
+
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+    sd.RawInputStream(samplerate=24000, channels=1, dtype="int16",
+        blocksize=480, latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d))).start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+
+    await conn.session.update(session={
+        "type": "realtime",
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": 24000},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.5,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 200}},
+            "output": {"format": {"type": "audio/pcm", "rate": 24000}, "voice": "sage"},
+        },
+    })
+
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()  # drain stale mic audio
+
+    async def stream_mic():
+        while True:
+            pcm = await loop.run_in_executor(None, q.get)
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(pcm).decode()}))
+
+    async def handle_events():
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+            if et == "session.created":
+                await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                player.clear()
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"Agent: {e.get('transcript', '')}")
+            elif et == "conversation.item.input_audio_transcription.completed":
+                print(f"You:   {e.get('transcript', '')}")
+
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        player.close(); await conn.close()
+
+asyncio.run(main())
 ```
-┌─────────────┐                 ┌─────────────────┐                 ┌─────────────┐
-│             │     Audio       │                 │      Audio      │             │
-│    User     │  ────────────►  │   Voice Agent   │  ────────────►  │    User     │
-│  (speaks)   │                 │                 │                 │   (hears)   │
-└─────────────┘                 └─────────────────┘                 └─────────────┘
+</Step>
+
+<Step title="Start talking">
+```bash
+python agent.py
 ```
+You'll see transcripts in the terminal and hear the agent respond in real time.
+</Step>
 
-1. **User speaks** — Your app captures microphone audio and streams it to the agent
-2. **Agent responds** — The agent processes the speech and generates a spoken response
-3. **User hears** — Your app receives audio and plays it through the speaker
+</Steps>
 
-The entire flow happens in real-time with low latency.
+<Tip>
+That's the entire stack — mic capture, VAD, transcription, LLM, and TTS — in one script. Keep reading to build something real.
+</Tip>
 
 ---
 
-## Quick Start
+## What you can build
 
-Get a voice agent up and running in 3 steps.
+These aren't toy demos — they're production-quality voice agents you can copy, run, and iterate on in minutes.
 
-### Step 1: Get your API key
+<CardGroup cols={3}>
+  <Card title="🍕 Pizza order taker" href="#pizza-order-taker">
+    Takes a complete order through natural conversation. Handles toppings, sizes, delivery addresses, and specials. Calls a single tool at checkout — no mid-conversation tool spam. Stage-based prompting keeps the agent focused at each phase.
+  </Card>
+  <Card title="🔧 Auto service desk" href="#auto-service-desk">
+    Diagnoses car problems by asking smart follow-up questions ("does it happen all the time?", "any warning lights?"), suggests the right service, then books an appointment. The LLM's reasoning does the diagnostic work — no lookup tables needed.
+  </Card>
+  <Card title="🏥 ENT appointment setter" href="#ent-appointment-setter">
+    Routes callers to the right specialist based on their symptoms, checks doctor availability, and confirms all details before booking. Handles new vs. returning patients, allergy checks, and scheduling constraints.
+  </Card>
+</CardGroup>
 
-Grab your API key from your [AssemblyAI dashboard](https://www.assemblyai.com/app).
-
-### Step 2: Create your agent
+---
 
-Create an agent by sending a POST request. Here's an example of a friendly assistant:
+## Agent examples
 
 <Tabs>
-<Tab title="cURL">
-```bash
-curl -X POST https://aaigentsv1.up.railway.app/agents \
-  -H "Authorization: YOUR_API_KEY" \
-  -H "Content-Type: application/json" \
-  -d '{
-    "agent_name": "friendly_assistant",
-    "instructions": "You are a friendly and helpful assistant. Keep your responses concise and conversational. Be warm and personable.",
-    "voice": "luna",
-    "greeting": "Say hello and ask how you can help today."
-  }'
-```
-</Tab>
-<Tab title="Python">
-```python
-import requests
-
-response = requests.post(
-"https://aaigentsv1.up.railway.app/agents",
-headers={
-"Authorization": "YOUR_API_KEY",
-"Content-Type": "application/json"
-},
-json={
-"agent_name": "friendly_assistant",
-"instructions": "You are a friendly and helpful assistant. Keep your responses concise and conversational. Be warm and personable.",
-"voice": "luna",
-"greeting": "Say hello and ask how you can help today."
-}
-)
 
-print(response.json())
+<Tab title="🍕 Pizza order taker">
 
-````
-</Tab>
-<Tab title="JavaScript">
-```javascript
-const response = await fetch("https://aaigentsv1.up.railway.app/agents", {
-  method: "POST",
-  headers: {
-    "Authorization": "YOUR_API_KEY",
-    "Content-Type": "application/json"
-  },
-  body: JSON.stringify({
-    agent_name: "friendly_assistant",
-    instructions: "You are a friendly and helpful assistant. Keep your responses concise and conversational. Be warm and personable.",
-    voice: "luna",
-    greeting: "Say hello and ask how you can help today."
-  })
-});
-
-console.log(await response.json());
-````
+A full pizza ordering experience — the agent answers the phone, takes the order through conversation, and only calls a tool once when the customer confirms. No tool calls mid-order.
 
-</Tab>
-</Tabs>
+**Try saying:** *"Hi, I'd like a large pepperoni with extra cheese for delivery to 42 Main Street."*
 
-### Step 3: Start a conversation
+**What makes it interesting:** The `place_order` tool is called exactly once at the end with the entire order — items, delivery method, and address all gathered from conversation context. Stage-based prompting shifts the agent's instructions as the call progresses.
 
-Connect to your agent via WebSocket and start talking:
+```python Key snippets
+STAGE_INSTRUCTIONS = {
+    "greeting": """You work the phones at Sal's Pizza.
+Your opener is "Sal's Pizza, pickup or delivery?" — nothing more.""",
 
-```
-wss://aaigentsv1.up.railway.app/ws/friendly_assistant
+    "ordering": """Take items one by one. "Got it, what else?" after each.
+Remember everything from the conversation — no tools yet.
+When done: read back the order, ask "that sound right?"
+Only call place_order AFTER they confirm.""",
+
+    "placed": """Order is placed. Just say the total and ETA from the result.
+Do NOT repeat the items. Do NOT call more tools.""",
+}
+
+TOOLS = [{
+    "type": "function",
+    "name": "place_order",
+    "description": "Place the complete order. Call ONCE after the customer confirms.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "items": {
+                "type": "array",
+                "items": {"type": "object", "properties": {
+                    "type": {"type": "string", "description": "'pizza' or 'side' or 'drink'"},
+                    "size": {"type": "string"},
+                    "toppings": {"type": "array", "items": {"type": "string"}},
+                    "special": {"type": "string", "description": "Meat Lover's, Veggie Supreme, or BBQ Chicken"},
+                    "name": {"type": "string", "description": "For sides/drinks"},
+                    "quantity": {"type": "integer"},
+                }},
+            },
+            "method": {"type": "string", "description": "'delivery' or 'pickup'"},
+            "address": {"type": "string"},
+        },
+        "required": ["items", "method"],
+    },
+}]
 ```
 
-Once connected, send audio as binary WebSocket frames (PCM16, 16kHz, mono) and receive the agent's spoken responses back as audio.
+<Accordion title="Full runnable script">
 
-<Accordion title="Full Python example">
-```python
-import asyncio
-import json
-import websockets
+```python pizza_agent.py
+import asyncio, base64, datetime, json, queue, struct, threading, time, uuid
 import sounddevice as sd
-import numpy as np
+from openai import AsyncOpenAI
 
-async def voice_chat():
-uri = "wss://aaigentsv1.up.railway.app/ws/friendly_assistant"
-queue = asyncio.Queue(maxsize=100)
-session_ready = False
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+TARGET_RATE = 24000
 
-    async with websockets.connect(uri, ping_interval=10, ping_timeout=20) as ws:
-        print("Connected! Waiting for session...")
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
 
-        # Send microphone audio to the agent
-        async def send_audio():
-            while True:
-                data = await queue.get()
-                if session_ready:
-                    await ws.send(data)
-                queue.task_done()
+MENU = {
+    "sizes": {"small": 8.99, "medium": 11.99, "large": 14.99, "extra-large": 17.99},
+    "toppings": {"pepperoni": 1.50, "sausage": 1.50, "bacon": 2.00, "ham": 1.50,
+                 "chicken": 2.00, "mushrooms": 1.00, "onions": 1.00, "peppers": 1.00,
+                 "olives": 1.00, "jalapeños": 1.00, "pineapple": 1.50, "extra cheese": 1.50},
+    "sides": {"garlic bread": 4.99, "wings (6pc)": 7.99, "wings (12pc)": 13.99, "caesar salad": 6.99},
+    "drinks": {"coke": 2.49, "diet coke": 2.49, "sprite": 2.49, "water": 1.99},
+    "specials": {
+        "Meat Lover's": {"toppings": ["pepperoni", "sausage", "bacon", "ham"], "discount": 2.00},
+        "Veggie Supreme": {"toppings": ["mushrooms", "onions", "peppers", "olives"], "discount": 2.50},
+        "BBQ Chicken": {"toppings": ["chicken", "onions", "bacon", "extra cheese"], "discount": 1.50},
+    },
+}
 
-        asyncio.create_task(send_audio())
-        loop = asyncio.get_running_loop()
+orders = {}
+
+
+def calc_order(items):
+    """Price an order against MENU.
+
+    Args:
+        items: list of dicts. An item with ``"type": "pizza"`` is priced from
+            ``size``, ``toppings`` and the optional ``special``; any other item
+            is looked up by ``name`` in the sides and drinks menus. ``quantity``
+            multiplies the price of either kind; a missing, null, non-numeric
+            or non-positive quantity counts as 1.
+
+    Returns:
+        ``(line_items, subtotal)``. ``line_items`` is a list of
+        ``{"description", "price"}`` dicts and ``subtotal`` is the sum of their
+        prices. Sides and drinks that are not on the menu are skipped.
+    """
+    line_items = []
+    for item in items:
+        try:
+            qty = max(int(item.get("quantity") or 1), 1)
+        except (TypeError, ValueError):
+            qty = 1
+        if item.get("type") == "pizza":
+            size = item.get("size", "large")
+            base = MENU["sizes"].get(size, 14.99)  # unknown sizes are charged the large price
+            toppings = item.get("toppings", [])
+            special = item.get("special")
+            if special and special in MENU["specials"]:
+                # A known special replaces the requested toppings and carries a discount.
+                toppings = MENU["specials"][special]["toppings"]
+                discount = MENU["specials"][special]["discount"]
+            else:
+                discount = 0
+            # Toppings missing from the menu cost a flat 1.00; the price is floored at zero.
+            unit = round(max(base + sum(MENU["toppings"].get(t, 1.00) for t in toppings) - discount, 0), 2)
+            description = f"{size} {special or ', '.join(toppings) or 'cheese'} pizza"
+            if qty != 1:
+                # Previously quantity was ignored for pizzas, so N pizzas were charged as one.
+                description = f"{qty}x {description}"
+            line_items.append({"description": description, "price": round(unit * qty, 2)})
+        else:
+            name = item.get("name", "")
+            unit = MENU["sides"].get(name) or MENU["drinks"].get(name)
+            if unit:
+                line_items.append({"description": f"{qty}x {name}", "price": round(unit * qty, 2)})
+    return line_items, sum(i["price"] for i in line_items)
+
+
+def run_tool(name, args):
+    """Execute a tool call from the model and return a JSON-serialisable result.
+
+    Only ``place_order`` is supported. The order is priced with ``calc_order``,
+    a 3.99 delivery fee is added for delivery orders, and 8% tax is applied to
+    the subtotal plus fee.
+
+    Args:
+        name: tool name requested by the model.
+        args: decoded tool arguments (``items``, ``method``, optional ``address``).
+
+    Returns:
+        ``{"confirmed": True, "total", "estimated_time"}`` on success, or
+        ``{"error": message}`` when the order cannot be placed.
+    """
+    if name != "place_order":
+        return {"error": f"Unknown tool: {name}"}
+    items = args.get("items", [])
+    method = args.get("method", "pickup")
+    address = args.get("address", "")
+    if not items:
+        return {"error": "No items in order"}
+    if method == "delivery" and not address:
+        return {"error": "Need address for delivery"}
+    line_items, subtotal = calc_order(items)
+    if not line_items:
+        # calc_order skips anything it cannot price; confirming here would
+        # hand the customer a $0.00 order.
+        return {"error": "No recognized menu items in order"}
+    delivery_fee = 3.99 if method == "delivery" else 0
+    tax = round((subtotal + delivery_fee) * 0.08, 2)
+    return {
+        "confirmed": True,
+        "total": f"${round(subtotal + delivery_fee + tax, 2):.2f}",
+        "estimated_time": "30-45 min" if method == "delivery" else "15-20 min",
+    }
 
-        def mic_callback(indata, frames, time, status):
-            if not queue.full():
-                loop.call_soon_threadsafe(queue.put_nowait, bytes(indata))
 
-        with sd.InputStream(samplerate=16000, channels=1, dtype='int16', callback=mic_callback), \
-             sd.OutputStream(samplerate=16000, channels=1, dtype='int16') as speaker:
+# Tool schema advertised to the model. The per-field descriptions spell out the
+# exact values calc_order matches on ("pizza" as the item type, MENU keys for
+# sizes and names); without them pizzas fall through to the side/drink lookup
+# and sizes such as "XL" are silently charged the large price.
+TOOLS = [{
+    "type": "function",
+    "name": "place_order",
+    "description": "Place the complete order. Call ONCE after the customer confirms.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "items": {
+                "type": "array",
+                "items": {"type": "object", "properties": {
+                    "type": {"type": "string", "description": "'pizza' for a pizza; 'side' or 'drink' for anything else"},
+                    "size": {"type": "string", "description": "Pizza size: 'small', 'medium', 'large' or 'extra-large'"},
+                    "toppings": {"type": "array", "items": {"type": "string"},
+                                 "description": "Pizza toppings, lowercase menu names (e.g. 'pepperoni', 'extra cheese')"},
+                    "special": {"type": "string", "description": "Specialty pizza name: \"Meat Lover's\", 'Veggie Supreme' or 'BBQ Chicken'"},
+                    "name": {"type": "string", "description": "Side or drink menu name (e.g. 'garlic bread', 'wings (6pc)', 'coke')"},
+                    "quantity": {"type": "integer", "description": "How many of this item (default 1)"},
+                }},
+            },
+            "method": {"type": "string", "description": "'delivery' or 'pickup'"},
+            "address": {"type": "string"},
+        },
+        "required": ["items", "method"],
+    },
+}]
 
-            while True:
-                response = await ws.recv()
+# Persona and style rules prepended to every stage prompt in STAGE_INSTRUCTIONS.
+VOICE = """You ARE a real person. Max 1-2 sentences. "Got it", "cool", "yep" — not "Absolutely!" or "I'd be happy to!"
+NEVER say: certainly, absolutely, I'd be happy to, great question, fantastic."""
 
-                # Play audio responses
-                if isinstance(response, bytes) and len(response):
-                    speaker.write(np.frombuffer(response, dtype=np.int16))
+# System prompt for each conversation stage; main() swaps the session
+# instructions to the matching entry as the call moves from greeting to
+# ordering to placed. Prices quoted here must agree with MENU (water is $1.99,
+# the other drinks $2.49).
+STAGE_INSTRUCTIONS = {
+    "greeting": VOICE + "\n\nYou work the phones at Sal's Pizza. Your opener: \"Sal's Pizza, pickup or delivery?\" Nothing more.",
+    "ordering": VOICE + f"""
 
-                # Handle JSON messages
-                elif isinstance(response, str):
-                    msg = json.loads(response)
+You're taking a pizza order at Sal's. Take items one by one. "Got it, what else?" after each.
+Remember everything from conversation — no tools yet. Default to regular crust.
+When done: "anything else?" once, then read back and ask "that sound right?"
+Only call place_order AFTER they confirm.
 
-                    if msg.get("type") == "session.created":
-                        print("Session ready! Start speaking...")
-                        session_ready = True
+Menu: small $8.99, medium $11.99, large $14.99, XL $17.99. Toppings $1-2 each.
+Specials: Meat Lover's ($2 off), Veggie Supreme ($2.50 off), BBQ Chicken ($1.50 off).
+Sides: garlic bread $4.99, wings 6pc $7.99/12pc $13.99. Drinks $2.49 (water $1.99). Delivery: $3.99.""",
+    "placed": VOICE + "\n\nOrder placed. Just say total + ETA from the result. Do NOT repeat items. Do NOT call more tools.",
+}
 
-                    elif msg.get("type") == "conversation.item.done":
-                        item = msg.get("item", {})
-                        role = item.get("role")
-                        text = item.get("content", [{}])[0].get("text", "")
-                        print(f"[{role}]: {text}")
 
-asyncio.run(voice_chat())
+class AudioPlayer:
+    """Speaker output backed by a byte buffer that a sounddevice callback drains."""
+
+    def __init__(self):
+        self._lock = threading.Lock()
+        self._buf = bytearray()
+        self._out = sd.RawOutputStream(
+            samplerate=TARGET_RATE,
+            channels=1,
+            dtype="int16",
+            blocksize=480,
+            latency="low",
+            callback=self._callback,
+        )
+        self._out.start()
+
+    def _callback(self, outdata, frames, *_):
+        # Two bytes per frame: one channel of 16-bit samples.
+        wanted = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:wanted])
+            del self._buf[:wanted]
+        # Pad with zero bytes when less audio is buffered than was requested.
+        shortfall = wanted - len(chunk)
+        outdata[:] = chunk + b'\x00' * shortfall
+
+    def play(self, pcm):
+        """Append raw PCM bytes to the playback buffer."""
+        with self._lock:
+            self._buf.extend(pcm)
+
+    def clear(self):
+        """Discard all audio that has not been played yet."""
+        with self._lock:
+            self._buf.clear()
+
+    def close(self):
+        """Stop the output stream and release it."""
+        self._out.stop()
+        self._out.close()
+
+
+def resample(pcm: bytes, src: int) -> bytes:
+    """Convert little-endian 16-bit PCM from ``src`` Hz to ``TARGET_RATE`` Hz.
+
+    Uses linear interpolation between neighbouring input samples. Input that is
+    already at ``TARGET_RATE`` is returned unchanged.
+    """
+    if src == TARGET_RATE:
+        return pcm
+    n = len(pcm) // 2
+    samps = struct.unpack(f"<{n}h", pcm)
+    r = src / TARGET_RATE
+    last = n - 1
+    out = []
+    for i in range(int(n / r)):
+        pos = i * r
+        whole = int(pos)
+        # Clamp both neighbours to the final sample so the tail never reads past the input.
+        left = samps[min(whole, last)]
+        right = samps[min(whole + 1, last)]
+        out.append(int(left + (right - left) * (pos - whole)))
+    return struct.pack(f"<{len(out)}h", *out)
+
+
+async def main():
+    """Run the Sal's Pizza voice agent until the process is interrupted.
+
+    Opens the speaker and microphone streams, connects a realtime session
+    configured with the greeting prompt and the place_order tool, then runs two
+    coroutines side by side: ``stream_mic`` uploads microphone audio and
+    ``handle_events`` reacts to server events (playback, stage changes, tool
+    execution).
+    """
+    player, q = AudioPlayer(), queue.Queue()
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
+    # Keep a reference to the input stream so its lifetime is explicit and it
+    # can be stopped and closed on shutdown. The callback only enqueues the raw
+    # bytes of each block; a block is 20 ms of audio at the device's native rate.
+    mic = sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d)))
+    mic.start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+
+    # Initial session: greeting prompt, PCM in and out at TARGET_RATE, server-side
+    # voice activity detection, and the place_order tool.
+    await conn.session.update(session={
+        "type": "realtime",
+        "instructions": STAGE_INSTRUCTIONS["greeting"],
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.6,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 500}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
+        },
+        "tools": TOOLS, "tool_choice": "auto",
+    })
+
+    # NOTE(review): this reaches into a private SDK attribute so that hand-built
+    # JSON frames can be sent directly; confirm it survives SDK upgrades.
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    # Discard microphone audio captured while the session was being set up.
+    while not q.empty(): q.get_nowait()
+
+    async def set_stage(name):
+        """Replace the session instructions with the prompt for stage ``name``."""
+        await conn.session.update(session={"type": "realtime", "instructions": STAGE_INSTRUCTIONS[name]})
+
+    stage = "greeting"
+    user_turns = 0
+
+    async def stream_mic():
+        """Forward microphone audio to the server as base64 PCM at TARGET_RATE."""
+        while True:
+            # Wait for audio in a worker thread so the event loop is not blocked.
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            # Drain whatever else is queued so one message carries all pending audio.
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except queue.Empty: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
+
+    async def handle_events():
+        """React to server events: play audio, advance stages, run the tool."""
+        nonlocal stage, user_turns
+        pending_call = None
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"])); continue
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created":
+                await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                # The caller started talking: drop any agent audio still queued for playback.
+                player.clear()
+                user_turns += 1
+                if stage == "greeting" and user_turns >= 2:
+                    stage = "ordering"; await set_stage("ordering")
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Sal's: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                # Only the first function call of a response is kept.
+                if item.get("type") == "function_call" and not pending_call:
+                    pending_call = {"call_id": item["call_id"], "arguments": json.loads(item.get("arguments", "{}"))}
+            elif et == "response.done":
+                if pending_call:
+                    result = run_tool("place_order", pending_call["arguments"])
+                    if result.get("confirmed"):
+                        stage = "placed"; await set_stage("placed")
+                        print(f"[{t}] 🧾 {result.get('total', '?')} — {result.get('estimated_time', '?')}")
+                    else:
+                        # The order was rejected: stay in (or return to) ordering so the
+                        # agent can fix it, rather than telling the caller it was placed.
+                        print(f"[{t}] ❌ {result.get('error')}")
+                        if stage != "ordering":
+                            stage = "ordering"; await set_stage("ordering")
+                    # NOTE(review): no explicit response.create follows the tool output;
+                    # presumably the server starts the follow-up response itself. Confirm.
+                    await conn.conversation.item.create(item={
+                        "id": f"item_{uuid.uuid4().hex[:24]}", "type": "function_call_output",
+                        "call_id": pending_call["call_id"], "output": json.dumps(result),
+                    })
+                    pending_call = None
+                elif stage == "greeting" and e.get("response", {}).get("status") == "completed" and user_turns >= 1:
+                    stage = "ordering"; await set_stage("ordering")
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
+
+    print("\n🍕 Sal's Pizza — Order Line\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close()
+        player.close(); await conn.close()
+
+
+# Start the agent only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 
-````
+</Accordion>
 
-Install dependencies with:
-```bash
-pip install websockets sounddevice numpy
-````
+</Tab>
 
-</Accordion>
+<Tab title="🔧 Auto service desk">
 
-That's it! You now have a working voice agent.
+A mechanic's service advisor that diagnoses car problems through conversation before booking. No lookup tables — the LLM does the reasoning.
 
----
+**Try saying:** *"My brakes are making a grinding noise when I slow down."*
 
-## Example agents
+**What makes it interesting:** The agent asks smart diagnostic follow-ups ("does it happen all the time?", "any warning lights?", "what's the mileage?") and maps symptoms to the right service. One tool call at the end with everything.
 
-Here are some practical examples to inspire your own agents.
+```python Key snippets
+# Condensed per-stage system prompts: greeting, then diagnosing, then booked.
+STAGES = {
+    "greeting": """Answer: "Mike's Auto, this is the service desk, how can I help?"
+Find out what's going on. Don't use tools.""",
 
-### Customer support agent
+    "diagnosing": """Figure out the problem. Ask one question at a time:
+- "When did it start?"
+- "Does it happen all the time or sometimes?"
+- "Any warning lights?"
+- "Roughly how many miles?"
+You know cars — grinding brakes = worn pads. Check engine light = diagnostic.
+Once you have a clear picture, confirm details and only then call book_service.""",
 
-```json
-{
-  "agent_name": "support_agent",
-  "instructions": "You are a customer support agent for a software company. Be helpful, patient, and empathetic. Ask clarifying questions to understand the customer's issue. If you can't solve a problem, offer to escalate to a human agent. Keep responses brief and focused.",
-  "voice": "celeste",
-  "greeting": "Thank the customer for calling and ask how you can help them today."
+    "booked": """Appointment booked. Just give them the date and time.
+Mention the lockbox if they're dropping off early. Say bye. No more tools.""",
 }
+
+# Symptom → service mappings the agent knows:
+# Grinding/squealing when braking → brake inspection/replacement
+# Check engine light → engine diagnostic ($95-125)
+# Won't start / slow crank → battery test
+# AC not cold → AC service ($125-250)
+# Pulling to one side → alignment check
+# Vibration at speed → tire balance
 ```
 
-### Appointment scheduler
+<Accordion title="Full runnable script">
 
-```json
-{
-  "agent_name": "appointment_scheduler",
-  "instructions": "You are a friendly receptionist who helps schedule appointments. Collect the caller's name, preferred date and time, and reason for the appointment. Confirm all details before ending the call. Be efficient but warm.",
-  "voice": "estelle",
-  "greeting": "Welcome the caller and ask if they'd like to schedule an appointment."
-}
-```
+```python mechanic_agent.py
+import asyncio, base64, datetime, json, queue, struct, threading, time, uuid
+import sounddevice as sd
+from openai import AsyncOpenAI
 
-### Virtual concierge
+# Placeholder: substitute a real AssemblyAI API key before running.
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+# Sample rate in Hz shared by the speaker stream, the session audio formats and the mic resampler.
+TARGET_RATE = 24000
 
-```json
-{
-  "agent_name": "hotel_concierge",
-  "instructions": "You are a luxury hotel concierge. Be warm, professional, and knowledgeable. Help guests with restaurant recommendations, local attractions, transportation, and any requests. Anticipate needs and offer personalized suggestions.",
-  "voice": "orion",
-  "greeting": "Welcome the guest and ask how you can make their stay more enjoyable."
+# OpenAI SDK client whose realtime WebSocket base URL points at the AssemblyAI speech-to-speech service.
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+# Captured once at import time; used in the prompts and for the past-date check in run_tool.
+TODAY = datetime.date.today()
+# In-memory bookings keyed by booking id; lost when the process exits.
+bookings = {}
+
+
+def get_available_slots(date_str):
+    """Return the half-hour slots from 08:00 to 16:30 that are not booked on ``date_str``."""
+    taken = set()
+    for booking in bookings.values():
+        if booking["date"] == date_str:
+            taken.add(booking["time"])
+    free = []
+    for hour in range(8, 17):
+        for minute in (0, 30):
+            label = f"{hour:02d}:{minute:02d}"
+            if label not in taken:
+                free.append(label)
+    return free
+
+
+def run_tool(name, args):
+    """Execute a tool call from the model and return a JSON-serialisable result.
+
+    Only ``book_service`` is supported. The request is validated, the slot is
+    checked against existing bookings, and the booking is stored in ``bookings``.
+
+    Args:
+        name: tool name requested by the model.
+        args: decoded tool arguments; ``customer_name``, ``vehicle``,
+            ``issue_description``, ``date`` (YYYY-MM-DD) and ``time`` (HH:MM)
+            are required, ``drop_off`` is optional.
+
+    Returns:
+        ``{"confirmed": True, "booking_id", "date", "time", "drop_off_note"}``
+        on success, or ``{"error": message}`` describing what to fix.
+    """
+    if name != "book_service":
+        return {"error": f"Unknown: {name}"}
+    required = ("customer_name", "vehicle", "issue_description", "date", "time")
+    missing = [field for field in required if not args.get(field)]
+    if missing:
+        # Name only the absent fields so the model knows exactly what to ask for.
+        return {"error": f"Missing: {', '.join(missing)}"}
+    customer = args["customer_name"]
+    vehicle = args["vehicle"]
+    issue = args["issue_description"]
+    date = args["date"]
+    time_slot = args["time"]
+    try:
+        d = datetime.date.fromisoformat(date)
+    except (TypeError, ValueError):  # TypeError covers a non-string date
+        return {"error": f"Bad date: {date}"}
+    if d < TODAY:
+        return {"error": "Can't book in the past"}
+    if d.weekday() >= 6:  # Monday is 0, so 6 is Sunday
+        return {"error": "Closed Sundays"}
+    # Store and compare the canonical YYYY-MM-DD form so differently formatted
+    # strings for the same day cannot double-book a slot.
+    date = d.isoformat()
+    available = get_available_slots(date)
+    if time_slot not in available:
+        # Covers already-booked slots as well as times outside the slot grid.
+        return {"error": f"{time_slot} not available. Open: {', '.join(available[:4])}"}
+    booking_id = f"MA-{uuid.uuid4().hex[:6].upper()}"
+    bookings[booking_id] = {"id": booking_id, "customer": customer, "vehicle": vehicle,
+                             "issue": issue, "date": date, "time": time_slot,
+                             "drop_off": args.get("drop_off", False)}
+    return {"confirmed": True, "booking_id": booking_id, "date": d.strftime("%A %B %d"),
+            "time": time_slot, "drop_off_note": "Lockbox by the front door for early drop-off." if args.get("drop_off") else None}
+
+
+# Tool schema advertised to the model. run_tool reads customer_name, vehicle,
+# issue_description, date, time and drop_off; phone, mileage and
+# suggested_service are accepted here but not read by run_tool.
+TOOLS = [{
+    "type": "function",
+    "name": "book_service",
+    "description": "Book a service appointment. Call ONCE after confirming all details.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "customer_name": {"type": "string"},
+            "phone": {"type": "string"},
+            "vehicle": {"type": "string", "description": "Year make model (e.g. '2019 Honda Civic')"},
+            "mileage": {"type": "string"},
+            "issue_description": {"type": "string"},
+            "suggested_service": {"type": "string"},
+            "date": {"type": "string", "description": "YYYY-MM-DD"},
+            "time": {"type": "string", "description": "HH:MM (24h)"},
+            "drop_off": {"type": "boolean"},
+        },
+        "required": ["customer_name", "vehicle", "issue_description", "date", "time"],
+    },
+}]
+
+# Persona and style rules prepended to every stage prompt in STAGES.
+VOICE = """You ARE a real person. You're the service advisor at Mike's Auto. Friendly, knows cars. Max 1-2 sentences.
+"Yeah", "got it", "mm-hmm" — not "Absolutely!" or "I'd be happy to!"
+NEVER say: certainly, absolutely, I'd be happy to, great question, fantastic."""
+
+# System prompt for each conversation stage; main() swaps the session
+# instructions to the matching entry. The date embedded in the prompts is TODAY,
+# formatted once when this dict is built.
+STAGES = {
+    "greeting": VOICE + f"\n\nAnswer: \"Mike's Auto, service desk, how can I help?\"\nFind out what's wrong. Don't use tools. Today: {TODAY.strftime('%A, %B %d, %Y')}. Open Mon-Sat 8-5.",
+    "diagnosing": VOICE + f"""
+
+Figure out the problem. Ask one follow-up at a time — react to answers naturally.
+Ask: when it started, if it's constant or intermittent, warning lights, mileage, what kind of car.
+Based on symptoms, suggest what it might be and what service is needed.
+You know cars: grinding brakes = worn pads; check engine = diagnostic; won't start = battery; AC not cold = recharge.
+Once you have everything, confirm: "[name], [vehicle], [issue], [date] at [time]. That right?"
+Only call book_service AFTER they confirm. Today: {TODAY.strftime('%A, %B %d, %Y')}.""",
+    "booked": VOICE + "\n\nAppointment booked. Give date + time. Mention lockbox for early drop-off. Ask if anything else. Say bye. No more tools.",
 }
+
+
+class AudioPlayer:
+    """Speaker output backed by a byte buffer that a sounddevice callback drains."""
+
+    def __init__(self):
+        self._lock = threading.Lock()
+        self._buf = bytearray()
+        stream_options = dict(samplerate=TARGET_RATE, channels=1, dtype="int16",
+                              blocksize=480, latency="low", callback=self._cb)
+        self._out = sd.RawOutputStream(**stream_options)
+        self._out.start()
+
+    def _cb(self, out, frames, *_):
+        # Two bytes per frame: one channel of 16-bit samples.
+        needed = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:needed])
+            del self._buf[:needed]
+        # Fill the remainder with zero bytes when the buffer runs short.
+        padding = b'\x00' * (needed - len(chunk))
+        out[:] = chunk + padding
+
+    def play(self, pcm):
+        """Append raw PCM bytes to the playback buffer."""
+        with self._lock:
+            self._buf.extend(pcm)
+
+    def clear(self):
+        """Discard all audio that has not been played yet."""
+        with self._lock:
+            self._buf.clear()
+
+    def close(self):
+        """Stop the output stream and release it."""
+        self._out.stop()
+        self._out.close()
+
+
+def resample(pcm, src):
+    """Convert little-endian 16-bit PCM from ``src`` Hz to ``TARGET_RATE`` Hz.
+
+    Uses linear interpolation between neighbouring input samples. Input that is
+    already at ``TARGET_RATE`` is returned unchanged.
+    """
+    if src == TARGET_RATE:
+        return pcm
+    n = len(pcm) // 2
+    samps = struct.unpack(f"<{n}h", pcm)
+    r = src / TARGET_RATE
+    last = n - 1
+    out = []
+    for i in range(int(n / r)):
+        pos = i * r
+        whole = int(pos)
+        # Clamp both neighbours to the final sample so the tail never reads past the input.
+        left = samps[min(whole, last)]
+        right = samps[min(whole + 1, last)]
+        out.append(int(left + (right - left) * (pos - whole)))
+    return struct.pack(f"<{len(out)}h", *out)
+
+
+async def main():
+    """Run the Mike's Auto service-desk voice agent until the process is interrupted.
+
+    Opens the speaker and microphone streams, connects a realtime session
+    configured with the greeting prompt and the book_service tool, then runs
+    two coroutines side by side: ``stream_mic`` uploads microphone audio and
+    ``handle_events`` reacts to server events (playback, stage changes, tool
+    execution).
+    """
+    player, q = AudioPlayer(), queue.Queue()
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
+    # Keep a reference to the input stream so its lifetime is explicit and it
+    # can be stopped and closed on shutdown. The callback only enqueues the raw
+    # bytes of each block; a block is 20 ms of audio at the device's native rate.
+    mic = sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d)))
+    mic.start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming", websocket_connection_options={"compression": None},
+    ).enter()
+    # Initial session: greeting prompt, PCM in and out at TARGET_RATE, server-side
+    # voice activity detection, and the book_service tool.
+    await conn.session.update(session={
+        "type": "realtime", "instructions": STAGES["greeting"],
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.6,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 500}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
+        },
+        "tools": TOOLS, "tool_choice": "auto",
+    })
+
+    # NOTE(review): this reaches into a private SDK attribute so that hand-built
+    # JSON frames can be sent directly; confirm it survives SDK upgrades.
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    # Discard microphone audio captured while the session was being set up.
+    while not q.empty(): q.get_nowait()
+
+    async def set_stage(name):
+        """Replace the session instructions with the prompt for stage ``name``."""
+        await conn.session.update(session={"type": "realtime", "instructions": STAGES[name]})
+
+    stage = "greeting"
+    user_turns = 0
+
+    async def stream_mic():
+        """Forward microphone audio to the server as base64 PCM at TARGET_RATE."""
+        while True:
+            # Wait for audio in a worker thread so the event loop is not blocked.
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            # Drain whatever else is queued so one message carries all pending audio.
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except queue.Empty: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
+
+    async def handle_events():
+        """React to server events: play audio, advance stages, run the tool."""
+        nonlocal stage, user_turns
+        pending = None
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"])); continue
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created": await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                # The caller started talking: drop any agent audio still queued for playback.
+                player.clear(); user_turns += 1
+                if stage == "greeting" and user_turns >= 2:
+                    stage = "diagnosing"; await set_stage("diagnosing")
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Mike's: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                # Only the first function call of a response is kept.
+                if item.get("type") == "function_call" and not pending:
+                    pending = {"call_id": item["call_id"], "arguments": json.loads(item.get("arguments", "{}"))}
+                    print(f"[{t}] 🔧 booking...")
+            elif et == "response.done":
+                if pending:
+                    stage = "booked"; await set_stage("booked")
+                    result = run_tool("book_service", pending["arguments"])
+                    if result.get("confirmed"): print(f"[{t}] ✅ {result['date']} at {result['time']}")
+                    else:
+                        # Booking failed: go back to diagnosing so the agent can fix the details.
+                        print(f"[{t}] ❌ {result.get('error')}"); stage = "diagnosing"; await set_stage("diagnosing")
+                    # NOTE(review): no explicit response.create follows the tool output;
+                    # presumably the server starts the follow-up response itself. Confirm.
+                    await conn.conversation.item.create(item={
+                        "id": f"item_{uuid.uuid4().hex[:24]}", "type": "function_call_output",
+                        "call_id": pending["call_id"], "output": json.dumps(result),
+                    })
+                    pending = None
+                elif stage == "greeting" and e.get("response", {}).get("status") == "completed" and user_turns >= 1:
+                    stage = "diagnosing"; await set_stage("diagnosing")
+            elif et == "error": print(f"[{t}] Error: {e.get('error', {})}")
+
+    print(f"\n🔧 Mike's Auto — Service Desk\nToday: {TODAY.strftime('%A, %B %d, %Y')}\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        mic.stop(); mic.close()
+        player.close(); await conn.close()
+
+
+# Start the agent only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())
 ```
 
----
+</Accordion>
+
+</Tab>
 
-## Choose a voice
+<Tab title="🏥 ENT appointment setter">
 
-Pick a voice that matches your agent's personality.
+Routes callers to the right specialist based on their symptoms, finds an available slot, and books the appointment. Handles new vs. returning patients and scheduling constraints.
 
-| Voice       | Style                               |
-| ----------- | ----------------------------------- |
-| `luna`      | Chill but excitable, gen-z optimist |
-| `celeste`   | Warm, laid-back, fun-loving         |
-| `orion`     | Older male, warm and happy          |
-| `ursa`      | Young male, energetic               |
-| `astra`     | Young female, wide-eyed and curious |
-| `esther`    | Older female, loving and caring     |
-| `estelle`   | Middle-aged female, sweet and kind  |
-| `andromeda` | Young female, breathy and calm      |
+**Try saying:** *"Hi, I've been having really bad sinus pressure for a few weeks."*
 
----
+**What makes it interesting:** The agent recommends the right doctor based on described symptoms (sinus → Dr. Okafor, child → Dr. Sharma, lump → Dr. Liu), gathers patient info conversationally, and only books after confirmation. Falls back gracefully on scheduling errors.
 
-## Add tools
+```python Key snippets
+# Each doctor's specialty and the weekdays (full English day names) on which they see patients.
+PROVIDERS = {
+    "Dr. Sarah Chen":    {"specialty": "General ENT",         "days": ["Monday","Tuesday","Wednesday","Thursday","Friday"]},
+    "Dr. Michael Okafor":{"specialty": "Sinus & Allergy",     "days": ["Monday","Wednesday","Friday"]},
+    "Dr. Priya Sharma":  {"specialty": "Pediatric ENT",       "days": ["Tuesday","Thursday"]},
+    "Dr. James Liu":     {"specialty": "Head & Neck Surgery", "days": ["Monday","Tuesday","Thursday"]},
+}
 
-Tools let your agent take actions—like checking a database, calling an API, or triggering a workflow.
+# Routing logic lives in the prompt — no hard-coded rules:
+# "Sinus issues, allergies, congestion → Dr. Okafor"
+# "Kids → Dr. Sharma"
+# "Lumps, growths, post-surgical → Dr. Liu"
+# "Everything else → Dr. Chen (Mon-Fri)"
+```
 
-Here's a simple example of an agent with a weather tool:
+<Accordion title="Full runnable script">
 
-```json
-{
-  "agent_name": "weather_assistant",
-  "instructions": "You help users check the weather. When they ask about weather, use the get_weather tool to look it up.",
-  "voice": "luna",
-  "tools": [
-    {
-      "name": "get_weather",
-      "description": "Get the current weather for a city",
-      "parameters": {
+```python ent_agent.py
+import asyncio, base64, datetime, json, queue, struct, threading, time, uuid
+import sounddevice as sd
+from openai import AsyncOpenAI
+
+# Placeholder: substitute a real AssemblyAI API key before running.
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+# Sample rate in Hz shared by the speaker stream, the session audio formats and the mic resampler.
+TARGET_RATE = 24000
+
+# OpenAI SDK client whose realtime WebSocket base URL points at the AssemblyAI speech-to-speech service.
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
+
+# Captured once at import time; used in the prompts and for the past-date check in run_tool.
+TODAY = datetime.date.today()
+# Each doctor's specialty and working weekdays; run_tool compares "days"
+# against strftime("%A") of the requested date.
+PROVIDERS = {
+    "Dr. Sarah Chen":    {"specialty": "General ENT",         "days": ["Monday","Tuesday","Wednesday","Thursday","Friday"]},
+    "Dr. Michael Okafor":{"specialty": "Sinus & Allergy",     "days": ["Monday","Wednesday","Friday"]},
+    "Dr. Priya Sharma":  {"specialty": "Pediatric ENT",       "days": ["Tuesday","Thursday"]},
+    "Dr. James Liu":     {"specialty": "Head & Neck Surgery", "days": ["Monday","Tuesday","Thursday"]},
+}
+# In-memory appointments keyed by appointment id; lost when the process exits.
+appointments = {}
+
+
+def get_slots(provider, date):
+    """Return the half-hour slots from 08:00 to 16:30 still free for ``provider`` on ``date``."""
+    taken = set()
+    for appt in appointments.values():
+        if appt["provider"] == provider and appt["date"] == date:
+            taken.add(appt["time"])
+    free = []
+    for hour in range(8, 17):
+        for minute in (0, 30):
+            label = f"{hour:02d}:{minute:02d}"
+            if label not in taken:
+                free.append(label)
+    return free
+
+
+def run_tool(name, args):
+    """Execute a tool call from the model and return a JSON-serialisable result.
+
+    Only ``book_appointment`` is supported. The request is validated against
+    PROVIDERS and existing appointments, then stored in ``appointments``.
+
+    Args:
+        name: tool name requested by the model.
+        args: decoded tool arguments; ``patient_name``, ``provider``, ``date``
+            (YYYY-MM-DD), ``time`` (HH:MM) and ``reason`` are required,
+            ``new_patient`` is optional.
+
+    Returns:
+        ``{"confirmed": True, "appointment_id", "summary", "arrive_early"}`` on
+        success, or ``{"error": message}`` describing what to fix.
+    """
+    if name != "book_appointment":
+        return {"error": f"Unknown: {name}"}
+    patient = args.get("patient_name", "")
+    provider = args.get("provider", "")
+    date = args.get("date", "")
+    slot = args.get("time", "")
+    reason = args.get("reason", "")
+    if not all([patient, provider, date, slot, reason]):
+        return {"error": "Missing: patient_name, provider, date, time, reason"}
+    if provider not in PROVIDERS:
+        return {"error": f"Unknown provider. Use: {list(PROVIDERS.keys())}"}
+    try:
+        d = datetime.date.fromisoformat(date)
+    except (TypeError, ValueError):
+        # Catch only parse failures; a bare except would also hide unrelated bugs.
+        return {"error": f"Bad date: {date}"}
+    # Reject past dates first: a weekday complaint about a date that has
+    # already gone by would send the conversation in the wrong direction.
+    if d < TODAY:
+        return {"error": "Can't book in the past"}
+    day = d.strftime("%A")
+    if day not in PROVIDERS[provider]["days"]:
+        return {"error": f"{provider} doesn't work {day}s. Available: {', '.join(PROVIDERS[provider]['days'])}"}
+    # Store and compare the canonical YYYY-MM-DD form so differently formatted
+    # strings for the same day cannot double-book a slot.
+    date = d.isoformat()
+    available = get_slots(provider, date)
+    if slot not in available:
+        return {"error": f"{slot} not available. Open: {', '.join(available[:5])}"}
+    apt_id = f"ENT-{uuid.uuid4().hex[:6].upper()}"
+    appointments[apt_id] = {"id": apt_id, "patient": patient, "provider": provider,
+                             "date": date, "time": slot, "reason": reason,
+                             "new_patient": args.get("new_patient", False)}
+    return {"confirmed": True, "appointment_id": apt_id,
+            "summary": f"{patient} with {provider} on {d.strftime('%A %B %d')} at {slot}",
+            "arrive_early": args.get("new_patient", False)}
+
+
+# Tool schema advertised to the model. run_tool reads patient_name, provider,
+# date, time, reason and new_patient; phone is accepted here but not read by run_tool.
+TOOLS = [{
+    "type": "function",
+    "name": "book_appointment",
+    "description": "Book an ENT appointment. Call ONCE after confirming all details.",
+    "parameters": {
         "type": "object",
         "properties": {
-          "city": {
-            "type": "string",
-            "description": "The city name"
-          }
+            "patient_name": {"type": "string"},
+            "provider": {"type": "string", "description": "Doctor's full name"},
+            "date": {"type": "string", "description": "YYYY-MM-DD"},
+            "time": {"type": "string", "description": "HH:MM (24h)"},
+            "reason": {"type": "string"},
+            "phone": {"type": "string"},
+            "new_patient": {"type": "boolean"},
         },
-        "required": ["city"]
-      }
-    }
-  ]
+        "required": ["patient_name", "provider", "date", "time", "reason"],
+    },
+}]
+
+# Persona and style rules prepended to every stage prompt in STAGES.
+VOICE = """You ARE a real person. Warm but efficient — you're busy. Max 1-2 sentences.
+"Sure thing", "got it", "mm-hmm" — not "Absolutely!" or "I'd be happy to!"
+NEVER say: certainly, absolutely, I'd be happy to, great question, fantastic."""
+
+# System prompt for each conversation stage; main() swaps the session
+# instructions to the matching entry. The doctor list in "scheduling" restates
+# PROVIDERS in prose, so the two must be kept in sync by hand.
+STAGES = {
+    "greeting": VOICE + f"\n\nYou're the receptionist at Riverside ENT Associates. Answer: \"Riverside ENT, how can I help you?\"\nFind out what they need. Ask if new or existing. Don't use tools. Today: {TODAY.strftime('%A, %B %d, %Y')}.",
+    "scheduling": VOICE + f"""
+
+You're scheduling at Riverside ENT. Gather through conversation: patient name, what they need seen for, preferred date/time, callback number.
+Suggest the right doctor based on their symptoms. Don't use tools yet.
+
+Today: {TODAY.strftime('%A, %B %d, %Y')}.
+Doctors:
+- Dr. Sarah Chen — General ENT (Mon-Fri). Good default.
+- Dr. Michael Okafor — Sinus & Allergy (Mon/Wed/Fri). Sinus, allergies, congestion.
+- Dr. Priya Sharma — Pediatric ENT (Tue/Thu). Kids.
+- Dr. James Liu — Head & Neck Surgery (Mon/Tue/Thu). Lumps, growths, post-surgical.
+
+Once you have everything: "[name] with [doctor] on [date] at [time] for [reason]. Sound right?"
+Only call book_appointment AFTER they confirm.""",
+    "booked": VOICE + "\n\nAppointment booked. Tell them the confirmation (date, time, doctor).\nIf new patient, mention arriving 15 min early for paperwork. Ask if anything else. Say bye. No more tools.",
 }
+
+
+class AudioPlayer:
+    """Speaker output backed by a byte buffer that a sounddevice callback drains."""
+
+    def __init__(self):
+        self._lock = threading.Lock()
+        self._buf = bytearray()
+        self._out = sd.RawOutputStream(
+            samplerate=TARGET_RATE,
+            channels=1,
+            dtype="int16",
+            blocksize=480,
+            latency="low",
+            callback=self._cb,
+        )
+        self._out.start()
+
+    def _cb(self, out, frames, *_):
+        # Two bytes per frame: one channel of 16-bit samples.
+        byte_count = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:byte_count])
+            del self._buf[:byte_count]
+        # Fill the remainder with zero bytes when the buffer runs short.
+        gap = byte_count - len(chunk)
+        out[:] = chunk + b'\x00' * gap
+
+    def play(self, pcm):
+        """Append raw PCM bytes to the playback buffer."""
+        with self._lock:
+            self._buf.extend(pcm)
+
+    def clear(self):
+        """Discard all audio that has not been played yet."""
+        with self._lock:
+            self._buf.clear()
+
+    def close(self):
+        """Stop the output stream and release it."""
+        self._out.stop()
+        self._out.close()
+
+
+def resample(pcm, src):
+    """Convert little-endian 16-bit PCM from ``src`` Hz to ``TARGET_RATE`` Hz.
+
+    Uses linear interpolation between neighbouring input samples. Input that is
+    already at ``TARGET_RATE`` is returned unchanged.
+    """
+    if src == TARGET_RATE:
+        return pcm
+    n = len(pcm) // 2
+    samps = struct.unpack(f"<{n}h", pcm)
+    r = src / TARGET_RATE
+    last = n - 1
+    out = []
+    for i in range(int(n / r)):
+        pos = i * r
+        whole = int(pos)
+        # Clamp both neighbours to the final sample so the tail never reads past the input.
+        left = samps[min(whole, last)]
+        right = samps[min(whole + 1, last)]
+        out.append(int(left + (right - left) * (pos - whole)))
+    return struct.pack(f"<{len(out)}h", *out)
+
+
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
+    sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), latency="low",
+        callback=lambda d,f,t,s: q.put_nowait(bytes(d))).start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming", websocket_connection_options={"compression": None},
+    ).enter()
+    await conn.session.update(session={
+        "type": "realtime", "instructions": STAGES["greeting"],
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.6,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 500}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
+        },
+        "tools": TOOLS, "tool_choice": "auto",
+    })
+
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()
+
+    async def set_stage(name):
+        await conn.session.update(session={"type": "realtime", "instructions": STAGES[name]})
+
+    stage = "greeting"
+    user_turns = 0
+
+    async def stream_mic():
+        while True:
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
+
+    async def handle_events():
+        nonlocal stage, user_turns
+        pending = None
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"])); continue
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created": await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                player.clear(); user_turns += 1
+                if stage == "greeting" and user_turns >= 2:
+                    stage = "scheduling"; await set_stage("scheduling")
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Riverside: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call" and not pending:
+                    pending = {"call_id": item["call_id"], "arguments": json.loads(item.get("arguments", "{}"))}
+                    print(f"[{t}] 🔧 booking...")
+            elif et == "response.done":
+                if pending:
+                    stage = "booked"; await set_stage("booked")
+                    result = run_tool("book_appointment", pending["arguments"])
+                    if result.get("confirmed"): print(f"[{t}] ✅ {result['summary']}")
+                    else:
+                        print(f"[{t}] ❌ {result.get('error')}"); stage = "scheduling"; await set_stage("scheduling")
+                    await conn.conversation.item.create(item={
+                        "id": f"item_{uuid.uuid4().hex[:24]}", "type": "function_call_output",
+                        "call_id": pending["call_id"], "output": json.dumps(result),
+                    })
+                    pending = None
+                elif stage == "greeting" and e.get("response", {}).get("status") == "completed" and user_turns >= 1:
+                    stage = "scheduling"; await set_stage("scheduling")
+            elif et == "error": print(f"[{t}] Error: {e.get('error', {})}")
+
+    print(f"\n🏥 Riverside ENT Associates\nToday: {TODAY.strftime('%A, %B %d, %Y')}\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        player.close(); await conn.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
 ```
 
-When a user asks "What's the weather in Tokyo?", the agent sends your client a `tool.call` event:
+</Accordion>
 
-```json
-{
-  "type": "tool.call",
-  "call_id": "call_abc123",
-  "name": "get_weather",
-  "arguments": { "city": "Tokyo" }
-}
+</Tab>
+
+</Tabs>
+
+---
+
+## Iterate fast with Claude Code
+
+The best way to build and tune voice agents is with an AI coding assistant that already understands the API. Drop a `CLAUDE.md` into your project and [Claude Code](https://claude.ai/code) will handle the boilerplate so you can focus on the agent behavior.
+
+<Steps>
+
+<Step title="Install Claude Code">
+```bash
+npm install -g @anthropic-ai/claude-code
+```
+</Step>
+
+<Step title="Start from an example above">
+Copy one of the agent scripts above into your project. Running it first means Claude Code has a concrete starting point to build on.
+</Step>
+
+<Step title="Drop in a CLAUDE.md">
+Create `CLAUDE.md` at the root of your project with the context below. Claude Code reads this automatically and will know exactly how the API works — no copy-pasting docs into every prompt.
+
+```markdown CLAUDE.md
+# AssemblyAI Speech-to-Speech API
+
+## Connection sequence (order matters)
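+1. Connect to wss://speech-to-speech.assemblyai.com/v1/realtime (with the OpenAI SDK, set websocket_base_url to wss://speech-to-speech.assemblyai.com/v1 — the SDK appends /realtime)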
+2. Send session.update IMMEDIATELY — do NOT wait for session.created
+3. Start streaming mic audio right away (server warms up on this audio)
+4. On session.created: call input_audio_buffer.clear(), drain mic queue
+5. Fresh audio flows from here with config applied
+
+## Session config (OpenAI SDK nested format)
+Include "type": "realtime" in the session object.
+turn_detection threshold: raise to 0.6-0.7 if triggering on noise.
+silence_duration_ms: raise to 400-500 if agent interrupts too early.
+
+## Tool calling pattern
+- Capture calls via response.output_item.done (item.type == "function_call")
+- Send results on response.done — server auto-generates follow-up, no response.create needed
+- Always include "id": f"item_{uuid.uuid4().hex[:24]}" in function_call_output items
+- Process all pending calls from one response together on response.done
+
+## Stage-based prompting
+Update instructions mid-session: await connection.session.update(session={"type": "realtime", "instructions": new_instructions})
+Do this: after greeting, before injecting tool results, when recovering from errors.
+Switch stage BEFORE injecting tool result — agent responds using instructions active at that moment.
+
+## AudioPlayer — must be callback-based
+Use sd.RawOutputStream with callback= parameter. Never call blocking write() on the event loop.
+play() just appends to a buffer. clear() empties it (call on speech_started for interruption).
+
+## Mic streaming
+Use queue.Queue (thread-safe, not asyncio.Queue).
+Use loop.run_in_executor(None, q.get) to await queue in async context without blocking.
+Use raw_ws = connection._connection and raw_ws.send() directly for audio sends
+— bypasses SDK Pydantic overhead on every 20ms chunk.
+
+## Common pitfalls
+- Send config BEFORE session.created — not on it, not after
+- asyncio.Queue is not thread-safe — always use queue.Queue for mic
+- Blocking write() for audio starves the event loop — use callback-based RawOutputStream
+- Chain tool calls in one response.done batch, not across separate responses
+- Missing "id" field in function_call_output breaks tool result association
+- Chunks >20ms add VAD latency — use 480 samples (20ms) at 24kHz
+```
+</Step>
+
+<Step title="Ask Claude to build or modify your agent">
+```
+> Change the pizza agent persona to a fast food drive-through, add a combo meals tool,
+  and make it push upsells for drinks
+```
+```
+> Add a check_calendar tool to the ENT agent that returns real availability
+  from a Google Calendar API call
+```
+```
+> Tune the VAD settings — the agent keeps interrupting me mid-sentence
 ```
 
-Your client executes the function and sends back the result:
+Claude Code reads your CLAUDE.md, understands the session init sequence and tool calling pattern, and makes changes that actually work.
+</Step>
+
+</Steps>
+
+<Tip>
+Start with one of the example agents above, not a blank file. Claude Code iterates much faster when it has a running baseline to modify rather than generating from scratch.
+</Tip>
+
+---
+
+## How it works
 
-```json
-{
-  "type": "tool.result",
-  "call_id": "call_abc123",
-  "result": "{\"temperature\": \"72°F\", \"conditions\": \"sunny\"}"
-}
 ```
+Client                                     Server
+  |                                           |
+  |--- WebSocket connect -------------------->|
+  |--- session.update (config) -------------->|  ← send immediately, don't wait
+  |--- input_audio_buffer.append ------------>|  ← start streaming right away
+  |                                           |
+  |<------------ session.created -------------|  ← clear buffer here
+  |<------------ speech_started --------------|  user is talking
+  |<------------ speech_stopped --------------|  user finished
+  |<------------ transcription.completed -----|  what the user said
+  |<------------ response.output_audio.delta -|  agent speaks back
+  |<------------ response.done ---------------|
+  |                                           |
+```
+
+<Warning>
+**Send config immediately — don't wait for `session.created`.** The server needs audio before it sends `session.created`. Sending config immediately means your instructions and turn detection settings are ready the moment the session is live. When `session.created` arrives, call `input_audio_buffer.clear()` to discard warmup audio and start fresh.
+</Warning>
 
-The agent then speaks the weather information to the user.
+The API is fully compatible with the OpenAI Realtime protocol, so the [OpenAI Python SDK](https://github.com/openai/openai-python), [OpenAI JS SDK](https://github.com/openai/openai-node), [LiveKit Agents](https://docs.livekit.io/agents/), and any OpenAI-compatible client work out of the box — just point them at `wss://speech-to-speech.assemblyai.com/v1`.
 
 ---
 
-## Agent configuration
+## Tool calling
 
-Full list of options when creating an agent.
+Give your agent the ability to call functions — look up data, take actions, call external APIs — then continue the conversation with the result.
 
-| Field                   | Type   | Default  | Description                                         |
-| ----------------------- | ------ | -------- | --------------------------------------------------- |
-| `agent_name`            | string | required | Unique identifier (letters, numbers, underscores)   |
-| `instructions`          | string | -        | Personality and behavior guidelines                 |
-| `voice`                 | string | `"luna"` | Voice to use for responses                          |
-| `greeting`              | string | -        | What the agent says when a conversation starts      |
-| `temperature`           | float  | `0.8`    | Response creativity (0.0 = focused, 1.0 = creative) |
-| `max_tokens`            | int    | `4096`   | Maximum response length                             |
-| `language`              | string | `"en"`   | Language code                                       |
-| `tools`                 | array  | -        | Tool definitions (see above)                        |
-| `audio_in_sample_rate`  | int    | `16000`  | Input audio sample rate in Hz                       |
-| `audio_out_sample_rate` | int    | `16000`  | Output audio sample rate in Hz                      |
+### Define tools
+
+```json
+"tools": [{
+  "type": "function",
+  "name": "check_availability",
+  "description": "Check available appointment slots for a given date",
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "date": {"type": "string", "description": "Date in YYYY-MM-DD format"}
+    },
+    "required": ["date"]
+  }
+}],
+"tool_choice": "auto"
+```
+
+### Handle tool calls
+
+Capture calls via `response.output_item.done`, then inject results on `response.done`. The server auto-generates the follow-up response — no need to call `response.create`.
+
+```python
+import uuid
+
+pending_calls = []
+
+# Step 1: capture when the output item is fully formed
+if et == "response.output_item.done":
+    item = e.get("item", {})
+    if item.get("type") == "function_call":
+        pending_calls.append({
+            "name": item["name"],
+            "call_id": item["call_id"],
+            "arguments": json.loads(item.get("arguments", "{}")),
+        })
+
+# Step 2: execute and inject on response.done
+elif et == "response.done":
+    if pending_calls:
+        calls = pending_calls[:]
+        pending_calls.clear()
+        for call in calls:
+            result = run_tool(call["name"], call["arguments"])
+            await connection.conversation.item.create(item={
+                "id": f"item_{uuid.uuid4().hex[:24]}",  # required
+                "type": "function_call_output",
+                "call_id": call["call_id"],
+                "output": json.dumps(result),
+            })
+```
+
+<Tip>
+Always include the `"id"` field in `function_call_output`. Without it, the server may not properly associate the result with the function call.
+</Tip>
+
+### Stage-based prompting
+
+For multi-step agents, update the instructions at each phase of the conversation. This keeps the model focused and prevents it from repeating information it already said.
+
+<Accordion title="How stage-based prompting works">
+
+Instead of one long system prompt that covers every possible phase, split your agent into stages and update the instructions as the conversation progresses:
+
+```python
+STAGES = {
+    "greeting": "Answer the phone. Find out what the caller needs. No tools yet.",
+    "main_task": "You have all the context now. Take the action and gather any remaining details.",
+    "wrap_up": "The task is done. Confirm the result briefly and say bye. No more tools.",
+}
+
+async def set_stage(name: str):
+    await connection.session.update(session={
+        "type": "realtime",
+        "instructions": STAGES[name],
+    })
+
+# In your event handler:
+# After user speaks a couple times → set_stage("main_task")
+# Before injecting tool result → set_stage("wrap_up")   ← critical timing
+```
+
+**The critical timing:** switch to the wrap-up stage *before* injecting the tool result, not after. The agent responds to the tool result using the *current* instructions at that moment. If you switch to "just give them the confirmation, no repeating" before it sees the result, the follow-up response will be tight and focused.
+
+</Accordion>
 
 ---
 
-## WebSocket events
+## Configuration
 
-When connected to an agent, you'll receive these events:
+### Session format
+
+<Tabs>
 
-### session.created
+<Tab title="OpenAI SDK (nested)">
 
-Sent when the connection is established and ready.
+Used by the OpenAI Python/JS SDK and LiveKit. Include `"type": "realtime"`.
 
 ```json
 {
-  "type": "session.created",
+  "type": "session.update",
   "session": {
-    "id": "uuid",
-    "agent_name": "my_agent"
+    "type": "realtime",
+    "instructions": "You are a helpful voice assistant.",
+    "output_modalities": ["audio", "text"],
+    "audio": {
+      "input": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "transcription": {"model": "universal-streaming"},
+        "turn_detection": {
+          "type": "server_vad",
+          "threshold": 0.5,
+          "prefix_padding_ms": 300,
+          "silence_duration_ms": 200
+        }
+      },
+      "output": {
+        "format": {"type": "audio/pcm", "rate": 24000},
+        "voice": "sage"
+      }
+    },
+    "tools": [],
+    "tool_choice": "auto"
   }
 }
 ```
 
-### conversation.item.done
+</Tab>
 
-Sent when a speaker finishes talking. Contains the transcript.
+<Tab title="Raw WebSocket (flat)">
+
+Used when connecting with raw WebSocket.
 
 ```json
 {
-  "type": "conversation.item.done",
-  "item": {
-    "role": "user",
-    "content": [{ "type": "text", "text": "What's the weather like?" }]
+  "type": "session.update",
+  "session": {
+    "instructions": "You are a helpful voice assistant.",
+    "voice": "sage",
+    "input_audio_format": "pcm16",
+    "input_audio_sample_rate": 24000,
+    "output_audio_format": "pcm16",
+    "output_audio_sample_rate": 24000,
+    "input_audio_transcription": {"model": "universal-streaming"},
+    "output_modalities": ["audio", "text"],
+    "turn_detection": {
+      "type": "server_vad",
+      "threshold": 0.5,
+      "prefix_padding_ms": 300,
+      "silence_duration_ms": 200
+    },
+    "tools": [],
+    "tool_choice": "auto"
   }
 }
 ```
 
-### conversation.item.interim
+</Tab>
+
+</Tabs>
+
+### Session parameters
+
+<ParamField path="instructions" type="string">
+  System prompt. Defines the agent's personality, role, and behavior. Can be updated mid-session with `session.update` to implement stage-based prompting.
+</ParamField>
+
+<ParamField path="voice" type="string" default="sage">
+  Voice for agent responses. Options: `sage`, `ember`, `breeze`, `cascade`.
+</ParamField>
+
+<ParamField path="output_modalities" type="array">
+  What the agent returns. `["audio", "text"]` gives you both spoken responses and transcripts.
+</ParamField>
+
+<ParamField path="input_audio_transcription" type="object">
+  Enables real-time transcription of user speech. Set `model` to `"universal-streaming"`.
+</ParamField>
+
+<ParamField path="turn_detection" type="object">
+  Server-side voice activity detection settings.
+</ParamField>
 
-Sent during speech with partial transcripts. Useful for showing real-time captions.
+<ParamField path="tools" type="array" default="[]">
+  Function definitions the agent can call.
+</ParamField>
+
+### Turn detection tuning
 
 ```json
-{
-  "type": "conversation.item.interim",
-  "item": {
-    "role": "user",
-    "content": [{ "type": "text", "text": "What's the wea..." }]
-  }
+"turn_detection": {
+  "type": "server_vad",
+  "threshold": 0.5,
+  "prefix_padding_ms": 300,
+  "silence_duration_ms": 200
 }
 ```
 
-### tool.call
+<ParamField path="threshold" type="float" default="0.5">
+  Speech sensitivity (0.0–1.0). **Raise to 0.6–0.7** if the agent is triggering on background noise or its own audio.
+</ParamField>
+
+<ParamField path="prefix_padding_ms" type="integer" default="300">
+  Audio preserved before speech onset. Prevents clipping the start of sentences.
+</ParamField>
+
+<ParamField path="silence_duration_ms" type="integer" default="200">
+  Pause length before considering a turn complete. **Raise to 400–500ms** if the agent interrupts before the user has finished speaking.
+</ParamField>
 
-Sent when the agent wants to use a tool. See [Add tools](#add-tools) for handling.
+### Voices
 
-### Audio (binary)
+| Voice | ID | Character |
+|-------|----|-----------|
+| Sage | `sage` | Warm, measured, professional |
+| Ember | `ember` | Expressive, energetic |
+| Breeze | `breeze` | Conversational, approachable |
+| Cascade | `cascade` | Clear, authoritative |
 
-The agent's spoken responses come as binary WebSocket frames containing PCM16 audio.
+### Audio format
+
+All audio is **PCM16** (signed 16-bit little-endian), **mono**, **24,000 Hz**, base64-encoded in JSON. Send chunks of approximately 20ms — 480 samples, 960 bytes raw, ~1280 bytes base64.
 
 ---
 
-## Audio format
+## SDK and framework examples
 
-Both input and output audio use the same format:
+<Tabs>
 
-- **Encoding**: PCM16 (16-bit signed integer, little-endian)
-- **Sample rate**: 16,000 Hz (configurable)
-- **Channels**: Mono
+<Tab title="Python (OpenAI SDK)" default>
 
----
+```bash
+pip install openai sounddevice
+```
 
-## REST API reference
+This is the recommended approach: a full production example with native-rate mic capture, a callback-based audio player, interruption handling, and correct tool calling.
 
-<Accordion title="Manage agents">
+<Accordion title="Full example">
 
-**Base URL**: `https://aaigentsv1.up.railway.app`
+```python agent.py
+import asyncio, base64, json, queue, struct, threading, time, uuid
+import sounddevice as sd
+from openai import AsyncOpenAI
 
-All REST endpoints require an `Authorization: YOUR_API_KEY` header.
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+TARGET_RATE = 24000
 
-### Create or update agent
+client = AsyncOpenAI(
+    api_key=API_KEY,
+    websocket_base_url="wss://speech-to-speech.assemblyai.com/v1",
+)
 
-`POST /agents` — Create a new agent or update an existing one.
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
 
-### List agents
 
-`GET /agents` — List all your agents.
+def run_tool(name, args):
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
 
-```json
-{
-  "agents": ["agent1", "agent2"],
-  "count": 2
-}
-```
 
-### Get agent
+class AudioPlayer:
+    """Callback-based — play() returns instantly, audio thread does the work."""
+    def __init__(self):
+        # Pending PCM bytes, plus the lock that guards them against the stream callback
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=TARGET_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback)
+        self._out.start()
+
+    def _callback(self, outdata, frames, *_):
+        # int16 mono → 2 bytes per frame
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        # Pad with zero bytes when fewer than n bytes were buffered
+        outdata[:] = chunk + b'\x00' * (n - len(chunk))
 
-`GET /agents/{agent_name}` — Get an agent's configuration.
+    def play(self, pcm: bytes):
+        # Only appends to the buffer; the stream callback consumes it later
+        with self._lock: self._buf.extend(pcm)
 
-### Delete agent
+    def clear(self):  # call on speech_started to stop agent audio immediately
+        with self._lock: self._buf.clear()
 
-`DELETE /agents/{agent_name}` — Delete an agent.
+    def close(self): self._out.stop(); self._out.close()
+
+
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+
+    # Capture at native rate — avoids hidden driver resampling buffers
+    dev = sd.query_devices(kind="input")
+    native_rate = int(dev["default_samplerate"])
+
+    def mic_cb(data, frames, ti, status): q.put_nowait(bytes(data))
+    sd.RawInputStream(samplerate=native_rate, channels=1, dtype="int16",
+        blocksize=int(native_rate * 0.02), callback=mic_cb, latency="low").start()
+
+    conn = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+
+    # Send config IMMEDIATELY — don't wait for session.created
+    await conn.session.update(session={
+        "type": "realtime",
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "output_modalities": ["audio", "text"],
+        "audio": {
+            "input": {"format": {"type": "audio/pcm", "rate": TARGET_RATE},
+                      "transcription": {"model": "universal-streaming"},
+                      "turn_detection": {"type": "server_vad", "threshold": 0.5,
+                                         "prefix_padding_ms": 300, "silence_duration_ms": 200}},
+            "output": {"format": {"type": "audio/pcm", "rate": TARGET_RATE}, "voice": "sage"},
+        },
+        "tools": TOOLS, "tool_choice": "auto",
+    })
+
+    raw_ws = conn._connection
+    loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()  # drain stale mic audio
+
+    def resample(pcm: bytes, src: int) -> bytes:
+        if src == TARGET_RATE: return pcm
+        n = len(pcm) // 2; samps = struct.unpack(f"<{n}h", pcm); r = src / TARGET_RATE
+        out = [int(samps[min(int(i*r), n-1)] + (samps[min(int(i*r)+1, n-1)] -
+               samps[min(int(i*r), n-1)]) * (i*r - int(i*r))) for i in range(int(n / r))]
+        return struct.pack(f"<{len(out)}h", *out)
+
+    async def stream_mic():
+        while True:
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except: break
+            await raw_ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(resample(bytes(buf), native_rate)).decode()}))
+
+    async def handle_events():
+        pending_calls = []
+        while True:
+            e = json.loads((await conn.recv_bytes()).decode())
+            et = e.get("type", "")
+
+            if et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"])); continue
+
+            t = time.strftime("%H:%M:%S")
+            if et == "session.created":
+                print(f"[{t}] Connected — {e['session']['id']}")
+                await conn.input_audio_buffer.clear()
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You speaking"); player.clear()
+            elif et == "conversation.item.input_audio_transcription.completed":
+                if txt := e.get("transcript", ""): print(f"[{t}] You:   {txt}")
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call":
+                    call_id = item["call_id"]
+                    if not any(c["call_id"] == call_id for c in pending_calls):
+                        pending_calls.append({"name": item["name"], "call_id": call_id,
+                                              "arguments": json.loads(item.get("arguments", "{}"))})
+                        print(f"[{t}] Tool: {item['name']}({item.get('arguments', '{}')})")
+            elif et == "response.done":
+                if pending_calls:
+                    calls = pending_calls[:]; pending_calls.clear()
+                    for call in calls:
+                        result = run_tool(call["name"], call["arguments"])
+                        print(f"[{t}]   → {json.dumps(result)[:80]}")
+                        await conn.conversation.item.create(item={
+                            "id": f"item_{uuid.uuid4().hex[:24]}",
+                            "type": "function_call_output",
+                            "call_id": call["call_id"],
+                            "output": json.dumps(result),
+                        })
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        player.close(); await conn.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 
 </Accordion>
 
-<Accordion title="Conversation history">
+</Tab>
+
+<Tab title="JavaScript (Browser)">
 
-### List conversations
+```bash
+npm install openai
+```
 
-`GET /agents/{agent_name}/conversations` — List all conversations for an agent.
+Browser-based voice agent using Web Audio API for low-latency, gapless playback. Browsers can't set auth headers on WebSocket, so connect via a lightweight proxy that injects your API key:
 
-```json
-{
-  "agent_name": "my_agent",
-  "conversations": [
-    {
-      "conversation_id": "uuid",
-      "created_at": "2025-12-18T13:00:00Z"
-    }
-  ],
-  "count": 1
+<Accordion title="Full browser example">
+
+```javascript agent.js
+// Browser voice agent — connect to a proxy that adds your AssemblyAI API key
+// Proxy endpoint: wss://your-server.com/api/s2s
+
+const WS_PROXY_URL = 'wss://your-server.com/api/s2s';
+
+// ── Audio playback ──────────────────────────────────────────────────────────
+// Web Audio API with precise scheduling: buffers play back-to-back with no gaps
+const audioCtx = new AudioContext({ sampleRate: 24000 });
+const gainNode = audioCtx.createGain();
+gainNode.connect(audioCtx.destination);
+let nextPlayTime = 0;
+
+function decodeAndPlay(base64Delta) {
+  const binary = atob(base64Delta);
+  const bytes = new Uint8Array(binary.length);
+  for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
+
+  const int16 = new Int16Array(bytes.buffer);
+  const float32 = new Float32Array(int16.length);
+  for (let i = 0; i < int16.length; i++) float32[i] = int16[i] / 32768;
+
+  const buffer = audioCtx.createBuffer(1, float32.length, 24000);
+  buffer.getChannelData(0).set(float32);
+
+  const source = audioCtx.createBufferSource();
+  source.buffer = buffer;
+  source.connect(gainNode);
+
+  const now = audioCtx.currentTime;
+  if (nextPlayTime < now) nextPlayTime = now; // reset if fallen behind
+  source.start(nextPlayTime);
+  nextPlayTime += buffer.duration;
+}
+
+function stopPlayback() {
+  // Recreate AudioContext to immediately stop all scheduled audio (interruption)
+  gainNode.disconnect();
+  const newCtx = new AudioContext({ sampleRate: 24000 });
+  const newGain = newCtx.createGain();
+  newGain.connect(newCtx.destination);
+  Object.assign(audioCtx, newCtx);
+  Object.assign(gainNode, newGain);
+  nextPlayTime = 0;
 }
-```
 
-### Get conversation
+// ── Mic capture ─────────────────────────────────────────────────────────────
+async function startMic(onChunk) {
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+  const ctx = new AudioContext(); // capture at native rate
+  const source = ctx.createMediaStreamSource(stream);
+  const processor = ctx.createScriptProcessor(2048, 1, 1);
+
+  processor.onaudioprocess = (e) => {
+    const inputData = e.inputBuffer.getChannelData(0);
+
+    // Resample from native rate to 24kHz
+    const ratio = ctx.sampleRate / 24000;
+    const outLength = Math.floor(inputData.length / ratio);
+    const pcm16 = new Int16Array(outLength);
+    for (let i = 0; i < outLength; i++) {
+      const srcIdx = i * ratio;
+      const idx = Math.floor(srcIdx);
+      const frac = srcIdx - idx;
+      const s1 = inputData[idx] || 0;
+      const s2 = inputData[Math.min(idx + 1, inputData.length - 1)] || s1;
+      const sample = Math.max(-1, Math.min(1, s1 + frac * (s2 - s1)));
+      pcm16[i] = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
+    }
 
-`GET /agents/{agent_name}/conversations/{conversation_id}` — Get a specific conversation with all messages.
+    // Convert to base64
+    const bytes = new Uint8Array(pcm16.buffer);
+    let binary = '';
+    for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i]);
+    onChunk(btoa(binary));
+  };
 
-```json
-{
-  "conversation_id": "uuid",
-  "agent_name": "my_agent",
-  "items": [],
-  "created_at": "2025-12-18T13:00:00Z"
+  source.connect(processor);
+  processor.connect(ctx.destination);
+  return { stop: () => { processor.disconnect(); stream.getTracks().forEach(t => t.stop()); } };
 }
-```
 
-</Accordion>
+// ── Main ─────────────────────────────────────────────────────────────────────
+async function startAgent() { // Opens the proxy WebSocket, sends the session config, wires event handling and mic streaming; resolves to { stop }
+  // Resume AudioContext on user gesture (browser autoplay policy)
+  await audioCtx.resume();
+
+  const ws = new WebSocket(WS_PROXY_URL);
+  let sessionReady = false; // gates mic forwarding until session.update has been sent
+
+  ws.onopen = () => {
+    // Send config immediately on open
+    ws.send(JSON.stringify({
+      type: 'session.update',
+      session: {
+        type: 'realtime',
+        instructions: 'You are a helpful voice assistant. Keep responses brief.',
+        output_modalities: ['audio', 'text'],
+        audio: {
+          input: {
+            format: { type: 'audio/pcm', rate: 24000 },
+            transcription: { model: 'universal-streaming' },
+            turn_detection: { type: 'server_vad', threshold: 0.5,
+                               prefix_padding_ms: 300, silence_duration_ms: 200 },
+          },
+          output: { format: { type: 'audio/pcm', rate: 24000 }, voice: 'sage' },
+        },
+      },
+    }));
+    sessionReady = true; // set right after sending, without waiting for a server acknowledgement
+  };
 
-<Accordion title="Tool definition schema">
+  const pendingCalls = []; // function calls queued on response.output_item.done, run on response.done. NOTE(review): the session.update above declares no tools — confirm they are configured elsewhere
 
-Tools follow JSON Schema format:
+  ws.onmessage = (event) => {
+    const msg = JSON.parse(event.data); // assumes every frame is a JSON text message — TODO confirm
 
-```json
-{
-  "name": "tool_name",
-  "description": "What this tool does",
-  "parameters": {
-    "type": "object",
-    "properties": {
-      "param_name": {
-        "type": "string",
-        "description": "What this parameter is for"
+    // Fast path: audio deltas — decode and schedule immediately
+    if (msg.type === 'response.output_audio.delta') {
+      decodeAndPlay(msg.delta);
+      return;
+    }
+
+    if (msg.type === 'session.created') {
+      console.log('Connected — session', msg.session.id);
+      ws.send(JSON.stringify({ type: 'input_audio_buffer.clear' }));
+    }
+    else if (msg.type === 'input_audio_buffer.speech_started') {
+      console.log('User speaking');
+      stopPlayback(); // interrupt agent
+    }
+    else if (msg.type === 'response.output_audio_transcript.done') {
+      console.log('Agent:', msg.transcript);
+    }
+    else if (msg.type === 'conversation.item.input_audio_transcription.completed') {
+      console.log('You:  ', msg.transcript);
+    }
+    else if (msg.type === 'response.output_item.done') {
+      const item = msg.item;
+      if (item?.type === 'function_call') {
+        pendingCalls.push({ name: item.name, callId: item.call_id,
+                            arguments: JSON.parse(item.arguments || '{}') });
       }
-    },
-    "required": ["param_name"]
+    }
+    else if (msg.type === 'response.done') {
+      if (pendingCalls.length > 0) {
+        const calls = pendingCalls.splice(0);
+        for (const call of calls) {
+          const result = runTool(call.name, call.arguments);
+          ws.send(JSON.stringify({
+            type: 'conversation.item.create',
+            item: {
+              id: `item_${crypto.randomUUID().replace(/-/g, '').slice(0, 24)}`,
+              type: 'function_call_output',
+              call_id: call.callId,
+              output: JSON.stringify(result),
+            },
+          }));
+        }
+      }
+    }
+    else if (msg.type === 'error') {
+      console.error('Error:', msg.error);
+    }
+  };
+
+  // Start mic and forward audio
+  const mic = await startMic((base64Audio) => {
+    if (ws.readyState === WebSocket.OPEN && sessionReady) {
+      ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: base64Audio }));
+    }
+  });
+
+  return { stop: () => { mic.stop(); ws.close(); } };
+}
+
+function runTool(name, args) { // Dispatches a tool call by name; returns a plain object that the caller JSON-stringifies
+  if (name === 'get_weather') {
+    return { temperature: 72, condition: 'sunny', location: args.location }; // canned demo values, not a real lookup
   }
+  return { error: `Unknown tool: ${name}` };
 }
+
+// Start on button click (required for AudioContext autoplay policy)
+document.getElementById('start').addEventListener('click', startAgent);
 ```
 
-**Supported parameter types**: `string`, `number`, `boolean`, `array`, `object`
+</Accordion>
+
+<Note>
+Browsers can't set custom headers on WebSocket connections. Run a minimal proxy that forwards the connection with your API key injected:
+
+```python proxy.py
+from fastapi import FastAPI, WebSocket
+from openai import AsyncOpenAI
+import asyncio, json
+
+app = FastAPI()
+client = AsyncOpenAI(api_key="YOUR_ASSEMBLYAI_API_KEY",
+                     websocket_base_url="wss://speech-to-speech.assemblyai.com/v1")
+
+@app.websocket("/api/s2s")
+async def proxy(websocket: WebSocket):  # Relays text frames both ways between the browser socket and the upstream realtime connection
+    await websocket.accept()
+    conn = await client.realtime.connect(
+        model="universal-streaming",
+        websocket_connection_options={"compression": None},
+    ).enter()
+    raw_ws = conn._connection  # NOTE(review): private SDK attribute — confirm it still exists when upgrading openai
+
+    async def from_client():  # browser -> upstream
+        try:
+            while True:
+                await raw_ws.send(await websocket.receive_text())
+        except Exception: pass  # any error, including a client disconnect, simply ends this direction
+
+    async def from_server():  # upstream -> browser
+        try:
+            while True:
+                await websocket.send_text((await conn.recv_bytes()).decode())
+        except Exception: pass  # any error, including an upstream close, simply ends this direction
+
+    _, pending = await asyncio.wait(
+        [asyncio.create_task(from_client()), asyncio.create_task(from_server())],
+        return_when=asyncio.FIRST_COMPLETED,  # stop relaying as soon as either direction ends
+    )
+    for t in pending: t.cancel()
+    await conn.close()
+```
+
+Start the proxy with an ASGI server (for example `uvicorn proxy:app --port 8000`), then connect the browser to `ws://localhost:8000/api/s2s`.
+</Note>
+
+</Tab>
+
+<Tab title="Raw WebSocket (Python)">
+
+```bash
+pip install websockets sounddevice
+```
+
+Direct WebSocket control. Use this if you prefer the flat session format or can't use the OpenAI SDK.
+
+<Accordion title="Full example">
+
+```python agent_ws.py
+import asyncio, base64, json, queue, threading, time, uuid
+import sounddevice as sd
+import websockets
+
+API_KEY = "YOUR_ASSEMBLYAI_API_KEY"
+WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"
+SAMPLE_RATE = 24000
+
+TOOLS = [{
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get the current weather for a location",
+    "parameters": {
+        "type": "object",
+        "properties": {"location": {"type": "string", "description": "City name"}},
+        "required": ["location"],
+    },
+}]
+
+
+def run_tool(name, args):
+    if name == "get_weather":
+        return {"temperature": 72, "condition": "sunny", "location": args["location"]}
+    return {"error": f"Unknown tool: {name}"}
+
+
+class AudioPlayer:
+    def __init__(self):
+        self._buf, self._lock = bytearray(), threading.Lock()
+        self._out = sd.RawOutputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+            blocksize=480, latency="low", callback=self._callback)
+        self._out.start()
+
+    def _callback(self, outdata, frames, *_):
+        n = frames * 2
+        with self._lock:
+            chunk = bytes(self._buf[:n]); del self._buf[:n]
+        outdata[:] = chunk + b'\x00' * (n - len(chunk))
+
+    def play(self, pcm):
+        with self._lock: self._buf.extend(pcm)
+
+    def clear(self):
+        with self._lock: self._buf.clear()
+
+    def close(self): self._out.stop(); self._out.close()
+
+
+async def main():
+    player, q = AudioPlayer(), queue.Queue()
+
+    def mic_cb(data, frames, ti, status): q.put_nowait(bytes(data))
+    sd.RawInputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16",
+        blocksize=480, callback=mic_cb, latency="low").start()
+
+    # websockets 13.x uses extra_headers; 14.x+ uses additional_headers
+    try:
+        ws = await websockets.connect(WS_URL, extra_headers={"Authorization": f"Bearer {API_KEY}"})
+    except TypeError:
+        ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"})
+
+    # Send config immediately
+    await ws.send(json.dumps({"type": "session.update", "session": {
+        "input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE,
+        "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE,
+        "input_audio_transcription": {"model": "universal-streaming"},
+        "turn_detection": {"type": "server_vad", "threshold": 0.5,
+                           "prefix_padding_ms": 300, "silence_duration_ms": 200},
+        "output_modalities": ["audio", "text"],
+        "instructions": "You are a helpful voice assistant. Keep responses brief.",
+        "voice": "sage",
+        "tools": TOOLS,
+        "tool_choice": "auto",
+    }}))
+
+    loop = asyncio.get_running_loop()
+    while not q.empty(): q.get_nowait()
+
+    async def stream_mic():
+        while True:
+            pcm = await loop.run_in_executor(None, q.get)
+            buf = bytearray(pcm)
+            while not q.empty():
+                try: buf.extend(q.get_nowait())
+                except: break
+            await ws.send(json.dumps({"type": "input_audio_buffer.append",
+                "audio": base64.b64encode(bytes(buf)).decode()}))
+
+    async def handle_events():
+        pending_calls = []
+        async for raw in ws:
+            e = json.loads(raw)
+            et = e.get("type", "")
+            t = time.strftime("%H:%M:%S")
+
+            if et == "session.created":
+                print(f"[{t}] Connected — {e['session']['id']}")
+                await ws.send(json.dumps({"type": "input_audio_buffer.clear"}))
+            elif et == "input_audio_buffer.speech_started":
+                print(f"[{t}] You speaking"); player.clear()
+            elif et == "conversation.item.input_audio_transcription.completed":
+                if txt := e.get("transcript", ""): print(f"[{t}] You:   {txt}")
+            elif et == "response.output_audio.delta":
+                player.play(base64.b64decode(e["delta"]))
+            elif et == "response.output_audio_transcript.done":
+                print(f"[{t}] Agent: {e.get('transcript', '')}")
+            elif et == "response.output_item.done":
+                item = e.get("item", {})
+                if item.get("type") == "function_call":
+                    call_id = item["call_id"]
+                    if not any(c["call_id"] == call_id for c in pending_calls):
+                        pending_calls.append({"name": item["name"], "call_id": call_id,
+                                              "arguments": json.loads(item.get("arguments", "{}"))})
+            elif et == "response.done":
+                s = e.get("response", {}).get("status", "?")
+                print(f"[{t}] Done ({s})")
+                if pending_calls:
+                    calls = pending_calls[:]; pending_calls.clear()
+                    for call in calls:
+                        result = run_tool(call["name"], call["arguments"])
+                        await ws.send(json.dumps({
+                            "type": "conversation.item.create",
+                            "item": {"id": f"item_{uuid.uuid4().hex[:24]}",
+                                     "type": "function_call_output",
+                                     "call_id": call["call_id"],
+                                     "output": json.dumps(result)},
+                        }))
+            elif et == "error":
+                print(f"[{t}] Error: {e.get('error', {})}")
+
+    print("Listening — start talking.\n")
+    try:
+        await asyncio.gather(stream_mic(), handle_events())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        player.close(); await ws.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 
 </Accordion>
+
+</Tab>
+
+<Tab title="LiveKit Agents">
+
+```bash
+pip install "livekit-agents[openai,silero]" python-dotenv
+```
+
+Uses the [LiveKit Agents framework](https://docs.livekit.io/agents/) for production deployments with WebRTC transport, room management, and client SDKs for web and mobile.
+
+```python agent.py
+import asyncio, os
+from dotenv import load_dotenv
+from livekit.agents import Agent, AgentServer, AgentSession, JobContext, JobProcess, RunContext, cli, function_tool
+from livekit.plugins import openai, silero
+from openai.types.beta.realtime.session import TurnDetection
+from openai.types.realtime import AudioTranscription
+
+load_dotenv()
+
+
+class VoiceAgent(Agent):  # Agent with fixed instructions and a single demo tool
+    def __init__(self):
+        super().__init__(instructions="You are a helpful voice assistant. Keep responses brief.")
+
+    @function_tool
+    async def get_weather(self, context: RunContext, location: str):
+        """Get the current weather for a location.
+
+        Args:
+            location: City name
+        """
+        return f"72 degrees and sunny in {location}."  # canned demo reply, not a real lookup
+
+
+server = AgentServer()
+
+
+def prewarm(proc: JobProcess):  # Loads the Silero VAD and stores it in the process userdata
+    proc.userdata["vad"] = silero.VAD.load()  # NOTE(review): nothing visible here reads userdata["vad"] — confirm it is still needed
+
+
+server.setup_fnc = prewarm
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext):  # Builds a session backed by the AssemblyAI realtime endpoint and starts VoiceAgent in the job's room
+    session = AgentSession(
+        llm=openai.realtime.RealtimeModel(
+            base_url="wss://speech-to-speech.assemblyai.com/v1",
+            api_key=os.environ["ASSEMBLYAI_API_KEY"],  # raises KeyError if the variable is not set
+            model="universal-streaming",
+            voice="sage",
+            input_audio_transcription=AudioTranscription(model="universal-streaming"),
+            turn_detection=TurnDetection(
+                type="server_vad",
+                threshold=0.5,
+                prefix_padding_ms=300,
+                silence_duration_ms=200,
+            ),
+        )
+    )
+    await session.start(agent=VoiceAgent(), room=ctx.room)
+    await ctx.connect()  # the session is started before connecting
+
+
+if __name__ == "__main__":
+    cli.run_app(server)
+```
+
+```bash
+python agent.py console
+```
+
+</Tab>
+
+</Tabs>
+
+---
+
+## Events reference
+
+### Client → Server
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.update` | Configure the session | `session`: config object |
+| `input_audio_buffer.append` | Stream an audio chunk | `audio`: base64 PCM16 |
+| `input_audio_buffer.commit` | Commit buffered audio as a user turn | — |
+| `input_audio_buffer.clear` | Discard buffered audio | — |
+| `conversation.item.create` | Add a message or tool result | `item` |
+| `conversation.item.delete` | Remove a conversation item | `item_id` |
+| `response.create` | Trigger the agent to respond | — |
+| `response.cancel` | Cancel an in-progress response | — |
+
+### Server → Client
+
+| Event | Description | Key fields |
+|-------|-------------|------------|
+| `session.created` | Session initialized — clear buffer here | `session.id` |
+| `session.updated` | Config applied | `session` |
+| `input_audio_buffer.speech_started` | User started speaking | `audio_start_ms` |
+| `input_audio_buffer.speech_stopped` | User stopped speaking | `audio_end_ms` |
+| `input_audio_buffer.committed` | Audio committed as a turn | — |
+| `conversation.item.created` | New item added | `item` |
+| `conversation.item.input_audio_transcription.completed` | User speech transcribed | `transcript` |
+| `response.created` | Agent started generating | — |
+| `response.output_item.done` | Output item complete — capture tool calls here | `item` |
+| `response.output_audio.delta` | Agent audio chunk | `delta`: base64 PCM16 |
+| `response.output_audio.done` | Agent audio complete | — |
+| `response.output_audio_transcript.delta` | Agent text (streaming) | `delta` |
+| `response.output_audio_transcript.done` | Agent text (final) | `transcript` |
+| `response.done` | Response complete — inject tool results here | `response.status` |
+| `error` | Error occurred | `error.message` |
+
+---
+
+## What's next
+
+<CardGroup cols={3}>
+  <Card title="Universal Streaming STT" href="/docs/speech-to-text/universal-streaming">
+    Use AssemblyAI's STT on its own — real-time transcription with speaker diarization, word timestamps, and more.
+  </Card>
+  <Card title="LeMUR" href="/docs/lemur">
+    Apply LLMs to audio — summarize calls, extract action items, answer questions about recordings.
+  </Card>
+  <Card title="Audio Intelligence" href="/docs/audio-intelligence">
+    Sentiment analysis, topic detection, PII redaction, and more on top of transcription.
+  </Card>
+</CardGroup>