diff --git a/fern/assets/components/AgentGenerator.tsx b/fern/assets/components/AgentGenerator.tsx new file mode 100644 index 000000000..1c09db552 --- /dev/null +++ b/fern/assets/components/AgentGenerator.tsx @@ -0,0 +1,453 @@ +"use client"; +import * as React from "react"; + +const DOCS_URL = + "https://www.assemblyai.com/docs/speech-to-text/voice-agents/speechtospeech"; + +type OutputFormat = "python" | "javascript" | "config"; + +const LLM_CONTEXT = `You are an expert at building real-time voice agents using the AssemblyAI Speech-to-Speech API. Based on the user's description, generate a complete voice agent implementation. + +## AssemblyAI Speech-to-Speech API Reference + +Endpoint: wss://speech-to-speech.assemblyai.com/v1/realtime +Auth: Authorization: Bearer YOUR_ASSEMBLYAI_API_KEY header on WebSocket connect +Audio: PCM16 (signed 16-bit little-endian), mono, 24000 Hz, base64-encoded in JSON +Voices: sage (default), ember, breeze, cascade + +### Session config (flat format, for raw WebSocket): +{ + "type": "session.update", + "session": { + "instructions": "System prompt here", + "voice": "sage", + "input_audio_format": "pcm16", + "input_audio_sample_rate": 24000, + "output_audio_format": "pcm16", + "output_audio_sample_rate": 24000, + "input_audio_transcription": {"model": "universal-streaming"}, + "output_modalities": ["audio", "text"], + "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200}, + "tools": [], + "tool_choice": "auto" + } +} + +### Tool definition schema: +{ + "type": "function", + "name": "tool_name", + "description": "What this tool does", + "parameters": { + "type": "object", + "properties": { + "param_name": {"type": "string", "description": "Param description"} + }, + "required": ["param_name"] + } +} + +### Tool calling pattern: +- On "response.function_call_arguments.done": Start executing the function immediately (use asyncio.create_task) +- On "response.done": Send results back 
via "conversation.item.create" with type "function_call_output" +- Do NOT send "response.create" after tool results — the server continues automatically +- Interruptions are handled automatically by server-side VAD — no client logic needed + +### Key events: +Client sends: session.update, input_audio_buffer.append (base64 audio) +Server sends: session.created, input_audio_buffer.speech_started, input_audio_buffer.speech_stopped, conversation.item.input_audio_transcription.completed, response.output_audio.delta (base64 audio), response.output_audio_transcript.done, response.function_call_arguments.done, response.done + +### Python quickstart template (raw WebSocket with websockets + sounddevice): +\`\`\`python +import asyncio, base64, json, threading, time +import sounddevice as sd +import websockets + +API_KEY = "YOUR_ASSEMBLYAI_API_KEY" +WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime" +SAMPLE_RATE = 24000 + +TOOLS = [] # Add tool definitions here + +async def run_tool(name, args): + # Implement tool logic here + return {"error": f"Unknown tool: {name}"} + +class AudioPlayer: + def __init__(self): + self._buf = bytearray() + self._lock = threading.Lock() + self._out = sd.RawOutputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, latency="low") + self._out.start() + def play(self, pcm): + with self._lock: + self._buf.extend(pcm) + while len(self._buf) >= 960: + self._out.write(bytes(self._buf[:960])) + del self._buf[:960] + def close(self): + self._out.stop(); self._out.close() + +async def main(): + player = AudioPlayer() + q = asyncio.Queue() + def mic_cb(data, frames, ti, status): + q.put_nowait(bytes(data)) + mic = sd.RawInputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=480, callback=mic_cb, latency="low") + mic.start() + + ws = await websockets.connect(WS_URL, additional_headers={"Authorization": f"Bearer {API_KEY}"}) + await ws.send(json.dumps({"type": "session.update", "session": { + 
"input_audio_format": "pcm16", "input_audio_sample_rate": SAMPLE_RATE, + "output_audio_format": "pcm16", "output_audio_sample_rate": SAMPLE_RATE, + "input_audio_transcription": {"model": "universal-streaming"}, + "turn_detection": {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 200}, + "output_modalities": ["audio", "text"], + "instructions": "SYSTEM_PROMPT_HERE", + "voice": "sage", + "tools": TOOLS, + "tool_choice": "auto", + }})) + pending_tasks = {} + async def stream_mic(): + while True: + try: + pcm = await asyncio.wait_for(q.get(), timeout=0.1) + await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(pcm).decode()})) + except asyncio.TimeoutError: + pass + async def handle_events(): + async for raw in ws: + e = json.loads(raw) + et = e.get("type", "") + t = time.strftime("%H:%M:%S") + if et == "session.created": + print(f"[{t}] Connected — session {e['session']['id']}") + elif et == "conversation.item.input_audio_transcription.completed": + print(f"[{t}] You: {e.get('transcript', '')}") + elif et == "response.output_audio.delta": + player.play(base64.b64decode(e["delta"])) + elif et == "response.output_audio_transcript.done": + print(f"[{t}] Agent: {e.get('transcript', '')}") + elif et == "response.function_call_arguments.done": + args = json.loads(e["arguments"]) + pending_tasks[e["call_id"]] = asyncio.create_task(run_tool(e["name"], args)) + print(f"[{t}] Tool: {e['name']}({e['arguments']})") + elif et == "response.done": + if pending_tasks and e.get("response", {}).get("status") == "completed": + for cid, task in pending_tasks.items(): + result = await task + await ws.send(json.dumps({"type": "conversation.item.create", "item": {"type": "function_call_output", "call_id": cid, "output": json.dumps(result)}})) + pending_tasks.clear() + print("Listening — start talking.\\n") + try: + await asyncio.gather(stream_mic(), handle_events()) + except KeyboardInterrupt: + pass + finally: + 
mic.stop(); mic.close(); player.close(); await ws.close() + +if __name__ == "__main__": + asyncio.run(main()) +\`\`\` + +### JavaScript quickstart template (raw WebSocket in Node.js): +\`\`\`javascript +// Requires: npm install ws +const WebSocket = require("ws"); +const API_KEY = "YOUR_ASSEMBLYAI_API_KEY"; +const WS_URL = "wss://speech-to-speech.assemblyai.com/v1/realtime"; + +const TOOLS = []; // Add tool definitions here + +function runTool(name, args) { + // Implement tool logic here + return { error: "Unknown tool: " + name }; +} + +const ws = new WebSocket(WS_URL, { headers: { Authorization: "Bearer " + API_KEY } }); +const pendingTasks = new Map(); + +ws.on("open", () => { + ws.send(JSON.stringify({ type: "session.update", session: { + input_audio_format: "pcm16", input_audio_sample_rate: 24000, + output_audio_format: "pcm16", output_audio_sample_rate: 24000, + input_audio_transcription: { model: "universal-streaming" }, + turn_detection: { type: "server_vad", threshold: 0.5, prefix_padding_ms: 300, silence_duration_ms: 200 }, + output_modalities: ["audio", "text"], + instructions: "SYSTEM_PROMPT_HERE", + voice: "sage", + tools: TOOLS, + tool_choice: "auto", + }})); + // Start streaming mic audio as base64 PCM16 via input_audio_buffer.append +}); + +ws.on("message", async (raw) => { + const e = JSON.parse(raw); + if (e.type === "response.output_audio.delta") { + // Play base64-decoded PCM16 audio: Buffer.from(e.delta, "base64") + } else if (e.type === "response.output_audio_transcript.done") { + console.log("Agent:", e.transcript); + } else if (e.type === "response.function_call_arguments.done") { + pendingTasks.set(e.call_id, runTool(e.name, JSON.parse(e.arguments))); + } else if (e.type === "response.done" && pendingTasks.size > 0) { + for (const [callId, resultPromise] of pendingTasks) { + const result = await resultPromise; + ws.send(JSON.stringify({ type: "conversation.item.create", item: { type: "function_call_output", call_id: callId, output: 
JSON.stringify(result) } })); + } + pendingTasks.clear(); + } +}); +\`\`\` + +Full documentation: ${DOCS_URL}`; + +const FORMAT_INSTRUCTIONS: Record = { + python: `Generate a COMPLETE, RUNNABLE Python script using the raw WebSocket template above. Include: +1. A detailed system prompt in the instructions field tailored to the agent's purpose +2. All tool definitions in the TOOLS array with proper JSON schemas +3. Full run_tool() implementation with realistic mock data for each tool +4. All imports, the AudioPlayer class, mic handling, and event loop — everything needed to pip install and run +Choose an appropriate voice from: sage, ember, breeze, cascade.`, + + javascript: `Generate a COMPLETE, RUNNABLE JavaScript/Node.js script using the JS WebSocket template above. Include: +1. A detailed system prompt in the instructions field tailored to the agent's purpose +2. All tool definitions in the TOOLS array with proper JSON schemas +3. Full runTool() implementation with realistic mock data for each tool +4. All requires, WebSocket setup, and event handling — everything needed to npm install and run +Choose an appropriate voice from: sage, ember, breeze, cascade. +For audio I/O, use a comment placeholder since Node.js audio libraries vary.`, + + config: `Generate ONLY the session configuration JSON (the session.update payload) with: +1. A detailed system prompt in the instructions field tailored to the agent's purpose +2. All tool definitions in the tools array with proper JSON schemas +3. An appropriate voice chosen from: sage, ember, breeze, cascade +4. 
/**
 * Shortens `text` to at most `maxLength` UTF-16 code units, preferring to cut
 * at the last space so a word is not split in half.
 *
 * @param text - The string to shorten.
 * @param maxLength - Maximum length of the result, in UTF-16 code units.
 *   Values of zero or below yield an empty string for non-empty input.
 * @returns `text` unchanged when it already fits; otherwise a prefix of `text`
 *   that is no longer than `maxLength` and never ends in a lone high surrogate.
 */
const truncateAtWordBoundary = (text: string, maxLength: number): string => {
  if (text.length <= maxLength) return text;

  let truncated = text.substring(0, maxLength);

  // A hard cut can land between the two halves of a surrogate pair (e.g. an
  // emoji). A trailing high surrogate is malformed UTF-16 and makes
  // encodeURIComponent throw URIError, so drop it.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }

  // Only back up to a space when it is past index 50; otherwise a very early
  // space would throw away almost all of the allowed text.
  const lastSpace = truncated.lastIndexOf(" ");
  if (lastSpace > 50) return text.substring(0, lastSpace);
  return truncated;
};
getMaxContentLength(baseUrl); + const prompt = encodeURIComponent(buildPrompt(maxLen)); + window.open(`${baseUrl}${prompt}`, "_blank"); + }; + + const openInGemini = () => { + const baseUrl = "https://aistudio.google.com/prompts/new_chat?prompt="; + const maxLen = getMaxContentLength(baseUrl); + const prompt = encodeURIComponent(buildPrompt(maxLen)); + window.open(`${baseUrl}${prompt}`, "_blank"); + }; + + const containerStyle: React.CSSProperties = { + border: "1px solid var(--grayscale-a4, #e5e7eb)", + borderRadius: "8px", + padding: "24px", + backgroundColor: "var(--grayscale-2, #f9fafb)", + }; + + const labelStyle: React.CSSProperties = { + display: "block", + fontSize: "14px", + fontWeight: 500, + marginBottom: "8px", + color: "var(--grayscale-12, #111827)", + }; + + const textareaStyle: React.CSSProperties = { + width: "100%", + height: "120px", + padding: "12px", + border: "1px solid var(--grayscale-a4, #d1d5db)", + borderRadius: "6px", + fontSize: "14px", + fontFamily: "inherit", + resize: "vertical", + backgroundColor: "var(--grayscale-1, #ffffff)", + color: "var(--grayscale-12, #111827)", + }; + + const charCountStyle: React.CSSProperties = { + fontSize: "12px", + color: "var(--grayscale-11, #6b7280)", + marginTop: "4px", + }; + + const toggleContainerStyle: React.CSSProperties = { + display: "flex", + gap: "4px", + padding: "4px", + backgroundColor: "var(--grayscale-a3, #e5e7eb)", + borderRadius: "6px", + width: "fit-content", + }; + + const toggleButtonStyle = (active: boolean): React.CSSProperties => ({ + padding: "6px 16px", + border: "none", + borderRadius: "4px", + fontSize: "13px", + fontWeight: 500, + cursor: "pointer", + backgroundColor: active ? "var(--grayscale-1, #ffffff)" : "transparent", + color: active ? "var(--grayscale-12, #111827)" : "var(--grayscale-11, #6b7280)", + boxShadow: active ? 
"0 1px 2px rgba(0,0,0,0.08)" : "none", + transition: "all 0.15s ease", + }); + + const buttonBaseStyle: React.CSSProperties = { + display: "inline-flex", + alignItems: "center", + gap: "8px", + padding: "10px 20px", + border: "none", + borderRadius: "6px", + fontSize: "14px", + fontWeight: 500, + cursor: "pointer", + color: "#ffffff", + }; + + const helpTextStyle: React.CSSProperties = { + marginTop: "12px", + fontSize: "13px", + color: "var(--grayscale-11, #6b7280)", + }; + + return ( +
+
+
+ +