diff --git a/src-tauri/src/bin/codex_monitor_daemon.rs b/src-tauri/src/bin/codex_monitor_daemon.rs
index 59bfc00bc..3fa571492 100644
--- a/src-tauri/src/bin/codex_monitor_daemon.rs
+++ b/src-tauri/src/bin/codex_monitor_daemon.rs
@@ -1329,6 +1329,25 @@ impl DaemonState {
         .await
     }
 
+    async fn generate_message_audio_summary(
+        &self,
+        workspace_id: String,
+        response_text: String,
+        model_id: Option<String>,
+    ) -> Result<String, String> {
+        codex_aux_core::generate_message_audio_summary_core(
+            &self.sessions,
+            &self.workspaces,
+            workspace_id,
+            &response_text,
+            model_id.as_deref(),
+            |workspace_id, thread_id| {
+                emit_background_thread_hide(&self.event_sink, workspace_id, thread_id);
+            },
+        )
+        .await
+    }
+
     async fn local_usage_snapshot(
         &self,
         days: Option,
diff --git a/src-tauri/src/bin/codex_monitor_daemon/rpc/codex.rs b/src-tauri/src/bin/codex_monitor_daemon/rpc/codex.rs
index cc278938e..f30f7e858 100644
--- a/src-tauri/src/bin/codex_monitor_daemon/rpc/codex.rs
+++ b/src-tauri/src/bin/codex_monitor_daemon/rpc/codex.rs
@@ -505,6 +505,23 @@ pub(super) async fn try_handle(
                     .and_then(|value| serde_json::to_value(value).map_err(|err| err.to_string())),
             )
         }
+        "generate_message_audio_summary" => {
+            let workspace_id = match parse_string(params, "workspaceId") {
+                Ok(value) => value,
+                Err(err) => return Some(Err(err)),
+            };
+            let response_text = match parse_string(params, "responseText") {
+                Ok(value) => value,
+                Err(err) => return Some(Err(err)),
+            };
+            let model_id = parse_optional_string(params, "modelId");
+            Some(
+                state
+                    .generate_message_audio_summary(workspace_id, response_text, model_id)
+                    .await
+                    .and_then(|value| serde_json::to_value(value).map_err(|err| err.to_string())),
+            )
+        }
         _ => None,
     }
 }
diff --git a/src-tauri/src/codex/mod.rs b/src-tauri/src/codex/mod.rs
index e55d1e9ee..bf6f5f220 100644
--- a/src-tauri/src/codex/mod.rs
+++ b/src-tauri/src/codex/mod.rs
@@ -1022,3 +1022,51 @@ pub(crate) async fn generate_agent_description(
     )
     .await
 }
+
+#[tauri::command]
+pub(crate) async fn generate_message_audio_summary(
+    workspace_id: String,
+    response_text: String,
+    model_id: Option<String>,
+    state: State<'_, AppState>,
+    app: AppHandle,
+) -> Result<String, String> {
+    if remote_backend::is_remote_mode(&*state).await {
+        let value = remote_backend::call_remote(
+            &*state,
+            app,
+            "generate_message_audio_summary",
+            json!({
+                "workspaceId": workspace_id,
+                "responseText": response_text,
+                "modelId": model_id,
+            }),
+        )
+        .await?;
+        return serde_json::from_value(value).map_err(|err| err.to_string());
+    }
+
+    crate::shared::codex_aux_core::generate_message_audio_summary_core(
+        &state.sessions,
+        &state.workspaces,
+        workspace_id,
+        &response_text,
+        model_id.as_deref(),
+        |workspace_id, thread_id| {
+            let _ = app.emit(
+                "app-server-event",
+                AppServerEvent {
+                    workspace_id: workspace_id.to_string(),
+                    message: json!({
+                        "method": "codex/backgroundThread",
+                        "params": {
+                            "threadId": thread_id,
+                            "action": "hide"
+                        }
+                    }),
+                },
+            );
+        },
+    )
+    .await
+}
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index 83e9dacae..0f46aa144 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -218,6 +218,7 @@ pub fn run() {
             codex::generate_commit_message,
             codex::generate_run_metadata,
             codex::generate_agent_description,
+            codex::generate_message_audio_summary,
             codex::resume_thread,
             codex::read_thread,
             codex::thread_live_subscribe,
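Review note: the RPC arm above accepts the same camelCase keys the desktop command forwards. A minimal sketch of the request payload, assuming the daemon's usual JSON-RPC envelope (the envelope fields are illustrative and not taken from this diff):

```ts
// Illustrative request payload for the new daemon method. Only the
// "params" keys come from this diff; the envelope shape is assumed.
const generateAudioSummaryRequest = {
  method: "generate_message_audio_summary",
  params: {
    workspaceId: "ws-1",                 // required, parsed via parse_string
    responseText: "Updated src/App.tsx", // required, parsed via parse_string
    modelId: "gpt-5-codex",              // optional, parse_optional_string
  },
};
```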
diff --git a/src-tauri/src/shared/codex_aux_core.rs b/src-tauri/src/shared/codex_aux_core.rs
index 12f985ade..4853c07e5 100644
--- a/src-tauri/src/shared/codex_aux_core.rs
+++ b/src-tauri/src/shared/codex_aux_core.rs
@@ -20,6 +20,17 @@ Keep the summary line under 72 characters. \
 Only output the commit message, nothing else.\n\n\
 Changes:\n{diff}";
 
+const DEFAULT_MESSAGE_AUDIO_SUMMARY_PROMPT: &str =
+    "You are preparing audio playback for a coding assistant response.\n\
+Summarize the response below into short spoken prose for a developer.\n\n\
+Requirements:\n\
+- Return plain text only.\n\
+- Do not include markdown fences, bullets, or headings.\n\
+- Keep it concise and easy to listen to.\n\
+- Preserve important commands, file paths, errors, results, and next actions when they matter.\n\
+- If the response is mostly code or a table, summarize the outcome instead of reading every line.\n\n\
+Assistant response:\n{response}";
+
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "camelCase")]
 pub(crate) struct GeneratedAgentConfiguration {
@@ -50,6 +61,15 @@ pub(crate) fn build_commit_message_prompt_for_diff(
     Ok(build_commit_message_prompt(diff, template))
 }
 
+pub(crate) fn build_message_audio_summary_prompt(response_text: &str) -> Result<String, String> {
+    let cleaned_response = response_text.trim();
+    if cleaned_response.is_empty() {
+        return Err("Response text is required.".to_string());
+    }
+
+    Ok(DEFAULT_MESSAGE_AUDIO_SUMMARY_PROMPT.replace("{response}", cleaned_response))
+}
+
 pub(crate) fn build_run_metadata_prompt(cleaned_prompt: &str) -> String {
     format!(
         "You create concise run metadata for a coding task.\n\
@@ -198,6 +218,28 @@ pub(crate) fn parse_agent_description_value(
     Err("No valid agent configuration was generated".to_string())
 }
 
+pub(crate) fn normalize_message_audio_summary_value(raw: &str) -> Result<String, String> {
+    let normalized = raw
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty() && !line.starts_with("```"))
+        .map(|line| {
+            line.strip_prefix("- ")
+                .or_else(|| line.strip_prefix("* "))
+                .or_else(|| line.strip_prefix("• "))
+                .unwrap_or(line)
+        })
+        .collect::<Vec<_>>()
+        .join(" ");
+
+    let normalized = normalized.split_whitespace().collect::<Vec<_>>().join(" ");
+    if normalized.is_empty() {
+        return Err("No summary was generated".to_string());
+    }
+
+    Ok(normalized)
+}
+
 pub(crate) fn parse_run_metadata_value(raw: &str) -> Result {
     let trimmed = raw.trim();
     if trimmed.is_empty() {
@@ -649,10 +691,38 @@ where
     parse_agent_description_value(&response)
 }
 
+pub(crate) async fn generate_message_audio_summary_core<F>(
+    sessions: &Mutex>>,
+    workspaces: &Mutex>,
+    workspace_id: String,
+    response_text: &str,
+    model: Option<&str>,
+    on_hide_thread: F,
+) -> Result<String, String>
+where
+    F: Fn(&str, &str),
+{
+    let prompt = build_message_audio_summary_prompt(response_text)?;
+    let response = run_background_prompt_core(
+        sessions,
+        workspaces,
+        workspace_id,
+        prompt,
+        model,
+        on_hide_thread,
+        "Timeout waiting for audio summary generation",
+        "Unknown error during audio summary generation",
+    )
+    .await?;
+
+    normalize_message_audio_summary_value(&response)
+}
+
 #[cfg(test)]
 mod tests {
     use super::{
-        build_commit_message_prompt_for_diff, parse_agent_description_value,
+        build_commit_message_prompt_for_diff, build_message_audio_summary_prompt,
+        normalize_message_audio_summary_value, parse_agent_description_value,
         parse_run_metadata_value,
     };
 
@@ -665,6 +735,22 @@
         );
     }
 
+    #[test]
+    fn build_message_audio_summary_prompt_requires_response_text() {
+        let result = build_message_audio_summary_prompt(" ");
+        assert_eq!(result.expect_err("should fail"), "Response text is required.");
+    }
+
+    #[test]
+    fn normalize_message_audio_summary_value_flattens_bullets_and_fences() {
+        let result = normalize_message_audio_summary_value(
+            "```md\n- Updated src/App.tsx\n- Ran npm run test\n```",
+        )
+        .expect("summary should parse");
+
+        assert_eq!(result, "Updated src/App.tsx Ran npm run test");
+    }
+
     #[test]
     fn parse_run_metadata_value_normalizes_worktree_name_alias() {
         let raw =
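Review note: with the prompt builder, normalizer, and core runner in place, the webview gets a one-call path from response text to speakable prose. A minimal sketch of that round trip, assuming the `generateMessageAudioSummary` wrapper this diff adds to `src/services/tauri.ts` further down:

```ts
import { generateMessageAudioSummary } from "@services/tauri";

// Minimal round trip: ask the backend for the hidden summary, then speak it.
// Cancellation, caching, and error toasts are elided; the real flow lives
// in useMessageAudio later in this diff.
async function speakSummarySketch(workspaceId: string, responseText: string) {
  const summary = await generateMessageAudioSummary(workspaceId, responseText, null);
  window.speechSynthesis.cancel(); // drop any in-flight utterance first
  window.speechSynthesis.speak(new SpeechSynthesisUtterance(summary));
}
```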
diff --git a/src/features/app/hooks/useMainAppLayoutSurfaces.ts b/src/features/app/hooks/useMainAppLayoutSurfaces.ts
index b6ac05279..5d05ec042 100644
--- a/src/features/app/hooks/useMainAppLayoutSurfaces.ts
+++ b/src/features/app/hooks/useMainAppLayoutSurfaces.ts
@@ -443,6 +443,7 @@ function buildPrimarySurface({
     workspacePath: activeWorkspace?.path ?? null,
     openTargets: appSettings.openAppTargets,
     selectedOpenAppId: appSettings.selectedOpenAppId,
+    selectedModelId,
     codeBlockCopyUseModifier: appSettings.composerCodeBlockCopyUseModifier,
     showMessageFilePath: appSettings.showMessageFilePath,
     userInputRequests,
diff --git a/src/features/messages/components/MessageRows.tsx b/src/features/messages/components/MessageRows.tsx
index 9e2848606..04516db4f 100644
--- a/src/features/messages/components/MessageRows.tsx
+++ b/src/features/messages/components/MessageRows.tsx
@@ -12,6 +12,7 @@ import Quote from "lucide-react/dist/esm/icons/quote";
 import Search from "lucide-react/dist/esm/icons/search";
 import Terminal from "lucide-react/dist/esm/icons/terminal";
 import Users from "lucide-react/dist/esm/icons/users";
+import Volume2 from "lucide-react/dist/esm/icons/volume-2";
 import Wrench from "lucide-react/dist/esm/icons/wrench";
 import X from "lucide-react/dist/esm/icons/x";
 import { exportMarkdownFile } from "@services/tauri";
@@ -19,6 +20,12 @@ import { pushErrorToast } from "@services/toasts";
 import type { ConversationItem } from "../../../types";
 import type { ParsedFileLocation } from "../../../utils/fileLinks";
 import { PierreDiffBlock } from "../../git/components/PierreDiffBlock";
+import { useMenuController } from "../../app/hooks/useMenuController";
+import {
+  PopoverMenuItem,
+  PopoverSurface,
+} from "../../design-system/components/popover/PopoverPrimitives";
+import type { MessageAudioState } from "../hooks/useMessageAudio";
 import {
   MAX_COMMAND_OUTPUT_LINES,
   basename,
@@ -60,6 +67,10 @@ type MessageRowProps = MarkdownFileLinkProps & {
   isCopied: boolean;
   onCopy: (item: Extract<ConversationItem, { kind: "message" }>) => void;
   onQuote?: (item: Extract<ConversationItem, { kind: "message" }>, selectedText?: string) => void;
+  audioState?: MessageAudioState;
+  onListenFull?: (messageId: string, speakableText: string) => void;
+  onListenSummary?: (messageId: string, responseText: string) => void;
+  onStopAudio?: (messageId?: string) => void;
   codeBlockCopyUseModifier?: boolean;
 };
@@ -99,6 +110,96 @@ type CommandOutputProps = {
   output: string;
 };
 
+const MESSAGE_CONTROL_SELECTOR =
+  ".message-quote-button, .message-copy-button, .message-audio-button, .message-audio-popover";
+
+const BLOCK_TEXT_TAGS = new Set([
+  "blockquote",
+  "div",
+  "h1",
+  "h2",
+  "h3",
+  "h4",
+  "h5",
+  "h6",
+  "li",
+  "ol",
+  "p",
+  "pre",
+  "table",
+  "tbody",
+  "td",
+  "th",
+  "thead",
+  "tr",
+  "ul",
+]);
+
+function pushTextBoundary(parts: string[]) {
+  if (parts.length === 0 || parts[parts.length - 1] === "\n") {
+    return;
+  }
+  parts.push("\n");
+}
+
+function collectSpeakableText(node: Node, parts: string[]) {
+  if (node.nodeType === Node.TEXT_NODE) {
+    const text = node.textContent?.replace(/\s+/g, " ") ?? "";
""; + if (text.trim()) { + parts.push(text); + } + return; + } + + if (!(node instanceof Element)) { + return; + } + + const tagName = node.tagName.toLowerCase(); + if (tagName === "button") { + return; + } + + if (tagName === "br") { + pushTextBoundary(parts); + return; + } + + const isBlock = BLOCK_TEXT_TAGS.has(tagName); + if (isBlock) { + pushTextBoundary(parts); + } + + node.childNodes.forEach((child) => collectSpeakableText(child, parts)); + + if (isBlock) { + pushTextBoundary(parts); + } +} + +function extractSpeakableMessageText( + bubble: HTMLDivElement | null, + fallbackText: string, +): string { + const markdownRoot = bubble?.querySelector(".markdown"); + if (!markdownRoot) { + return fallbackText.trim(); + } + + const parts: string[] = []; + collectSpeakableText(markdownRoot, parts); + const normalized = parts + .join("") + .replace(/\u00a0/g, " ") + .split("\n") + .map((line) => line.replace(/\s+/g, " ").trim()) + .filter(Boolean) + .join("\n\n") + .trim(); + + return normalized || fallbackText.trim(); +} + const MessageImageGrid = memo(function MessageImageGrid({ images, onOpen, @@ -371,6 +472,10 @@ export const MessageRow = memo(function MessageRow({ isCopied, onCopy, onQuote, + audioState, + onListenFull, + onListenSummary, + onStopAudio, codeBlockCopyUseModifier, showMessageFilePath, workspacePath, @@ -381,6 +486,7 @@ export const MessageRow = memo(function MessageRow({ const [lightboxIndex, setLightboxIndex] = useState(null); const bubbleRef = useRef(null); const selectionSnapshotRef = useRef(null); + const audioMenu = useMenuController(); const hasText = item.text.trim().length > 0; const imageItems = useMemo(() => { if (!item.images || item.images.length === 0) { @@ -401,6 +507,19 @@ export const MessageRow = memo(function MessageRow({ hasText && imageItems.length === 0 && isStandaloneMarkdownTable(item.text); + const resolvedAudioState = + audioState ?? + ({ + isActive: false, + mode: null, + status: "idle", + } satisfies MessageAudioState); + const canPlayAudio = + item.role === "assistant" && + hasText && + Boolean(onListenFull) && + Boolean(onListenSummary) && + Boolean(onStopAudio); const getSelectedMessageText = useCallback(() => { const bubble = bubbleRef.current; @@ -422,7 +541,7 @@ export const MessageRow = memo(function MessageRow({ return false; } const element = node instanceof Element ? 
@@ -371,6 +472,10 @@ export const MessageRow = memo(function MessageRow({
   isCopied,
   onCopy,
   onQuote,
+  audioState,
+  onListenFull,
+  onListenSummary,
+  onStopAudio,
   codeBlockCopyUseModifier,
   showMessageFilePath,
   workspacePath,
@@ -381,6 +486,7 @@
   const [lightboxIndex, setLightboxIndex] = useState(null);
   const bubbleRef = useRef(null);
   const selectionSnapshotRef = useRef(null);
+  const audioMenu = useMenuController();
   const hasText = item.text.trim().length > 0;
   const imageItems = useMemo(() => {
     if (!item.images || item.images.length === 0) {
@@ -401,6 +507,19 @@
     hasText &&
     imageItems.length === 0 &&
     isStandaloneMarkdownTable(item.text);
+  const resolvedAudioState =
+    audioState ??
+    ({
+      isActive: false,
+      mode: null,
+      status: "idle",
+    } satisfies MessageAudioState);
+  const canPlayAudio =
+    item.role === "assistant" &&
+    hasText &&
+    Boolean(onListenFull) &&
+    Boolean(onListenSummary) &&
+    Boolean(onStopAudio);
 
   const getSelectedMessageText = useCallback(() => {
     const bubble = bubbleRef.current;
@@ -422,7 +541,7 @@
         return false;
       }
       const element = node instanceof Element ? node : node.parentElement;
-      return Boolean(element?.closest(".message-quote-button, .message-copy-button"));
+      return Boolean(element?.closest(MESSAGE_CONTROL_SELECTOR));
     };
     if (isWithinMessageControls(selection.anchorNode) || isWithinMessageControls(selection.focusNode)) {
@@ -440,6 +559,46 @@
     onQuote(item, selectedText);
   }, [getSelectedMessageText, item, onQuote]);
 
+  const handleToggleAudioMenu = useCallback(
+    (event: MouseEvent) => {
+      event.preventDefault();
+      event.stopPropagation();
+      audioMenu.toggle();
+    },
+    [audioMenu],
+  );
+
+  const handleListenFull = useCallback(
+    (event: MouseEvent) => {
+      event.preventDefault();
+      event.stopPropagation();
+      audioMenu.close();
+      const speakableText = extractSpeakableMessageText(bubbleRef.current, item.text);
+      onListenFull?.(item.id, speakableText);
+    },
+    [audioMenu, item.id, item.text, onListenFull],
+  );
+
+  const handleListenSummary = useCallback(
+    (event: MouseEvent) => {
+      event.preventDefault();
+      event.stopPropagation();
+      audioMenu.close();
+      onListenSummary?.(item.id, item.text);
+    },
+    [audioMenu, item.id, item.text, onListenSummary],
+  );
+
+  const handleStopAudio = useCallback(
+    (event: MouseEvent) => {
+      event.preventDefault();
+      event.stopPropagation();
+      audioMenu.close();
+      onStopAudio?.(item.id);
+    },
+    [audioMenu, item.id, onStopAudio],
+  );
+
   return (
           onClose={() => setLightboxIndex(null)}
         />
       )}
-      {onQuote && hasText && (
-        <button type="button" className="message-quote-button" onClick={handleQuote}>
-          <Quote size={14} />
-        </button>
+      <div
+        className={`message-bubble-actions${resolvedAudioState.isActive ? " is-active" : ""}`}
+      >
+        {canPlayAudio && (
+          <div className="message-audio-menu-wrap">
+            <button
+              type="button"
+              className={`message-audio-button${resolvedAudioState.isActive ? " is-active" : ""}${
+                resolvedAudioState.status === "preparing" ? " is-busy" : ""
+              }`}
+              aria-label="Response audio"
+              onClick={handleToggleAudioMenu}
+            >
+              <Volume2 size={14} />
+            </button>
+            {audioMenu.isOpen && (
+              <PopoverSurface className="message-audio-popover">
+                <PopoverMenuItem onClick={handleListenFull}>
+                  Listen full
+                </PopoverMenuItem>
+                <PopoverMenuItem onClick={handleListenSummary}>
+                  Listen summary
+                </PopoverMenuItem>
+                <PopoverMenuItem onClick={handleStopAudio}>
+                  Stop
+                </PopoverMenuItem>
+              </PopoverSurface>
+            )}
+          </div>
+        )}
+        {onQuote && hasText && (
+          <button type="button" className="message-quote-button" onClick={handleQuote}>
+            <Quote size={14} />
+          </button>
+        )}
-      )}
-      <button type="button" className="message-copy-button" onClick={() => onCopy(item)}>
-        <span className="message-copy-icon" />
-      </button>
+        <button type="button" className="message-copy-button" onClick={() => onCopy(item)}>
+          <span className="message-copy-icon" />
+        </button>
+      </div>
   );
diff --git a/src/features/messages/components/Messages.test.tsx b/src/features/messages/components/Messages.test.tsx
index f82e5b803..75d491974 100644
--- a/src/features/messages/components/Messages.test.tsx
+++ b/src/features/messages/components/Messages.test.tsx
@@ -14,8 +14,9 @@ const useFileLinkOpenerMock = vi.fn(
 );
 const openFileLinkMock = vi.fn();
 const showFileLinkMenuMock = vi.fn();
-const { exportMarkdownFileMock } = vi.hoisted(() => ({
+const { exportMarkdownFileMock, generateMessageAudioSummaryMock } = vi.hoisted(() => ({
   exportMarkdownFileMock: vi.fn(),
+  generateMessageAudioSummaryMock: vi.fn(),
 }));
 
 vi.mock("../hooks/useFileLinkOpener", () => ({
@@ -33,10 +34,14 @@
   return {
     ...actual,
     exportMarkdownFile: exportMarkdownFileMock,
+    generateMessageAudioSummary: generateMessageAudioSummaryMock,
   };
 });
 
 describe("Messages", () => {
+  let originalSpeechSynthesis: SpeechSynthesis | undefined;
+  let originalSpeechSynthesisUtterance: typeof SpeechSynthesisUtterance | undefined;
+
   beforeAll(() => {
     if (!HTMLElement.prototype.scrollIntoView) {
       HTMLElement.prototype.scrollIntoView = vi.fn();
@@ -52,8 +57,59 @@
     openFileLinkMock.mockReset();
     showFileLinkMenuMock.mockReset();
     exportMarkdownFileMock.mockReset();
+    generateMessageAudioSummaryMock.mockReset();
+    originalSpeechSynthesis = window.speechSynthesis;
+    originalSpeechSynthesisUtterance = globalThis.SpeechSynthesisUtterance;
+  });
+
+  afterEach(() => {
+    Object.defineProperty(window, "speechSynthesis", {
+      configurable: true,
+      value: originalSpeechSynthesis,
+    });
+    Object.defineProperty(globalThis, "SpeechSynthesisUtterance", {
+      configurable: true,
+      value: originalSpeechSynthesisUtterance,
+    });
   });
 
+  function installSpeechMocks() {
+    const utterances: Array<{
+      text: string;
+      onend: ((event: Event) => void) | null;
+      onerror: ((event: Event) => void) | null;
+    }> = [];
+    const speak = vi.fn((utterance: { text: string }) => {
+      utterances.push(utterance as (typeof utterances)[number]);
+    });
+    const cancel = vi.fn();
+
+    Object.defineProperty(window, "speechSynthesis", {
+      configurable: true,
+      value: {
+        speak,
+        cancel,
+      } satisfies Partial<SpeechSynthesis>,
+    });
+
+    class MockSpeechSynthesisUtterance {
+      text: string;
+      onend: ((event: Event) => void) | null = null;
+      onerror: ((event: Event) => void) | null = null;
+
+      constructor(text: string) {
+        this.text = text;
+      }
+    }
+
+    Object.defineProperty(globalThis, "SpeechSynthesisUtterance", {
+      configurable: true,
+      value: MockSpeechSynthesisUtterance,
+    });
+
+    return { speak, cancel, utterances };
+  }
+
   it("renders image grid above message text and opens lightbox", () => {
     const items: ConversationItem[] = [
       {
@@ -243,6 +299,116 @@
     selection?.removeAllRanges();
   });
 
+  it("reads the rendered assistant response aloud", () => {
+    const { speak, utterances } = installSpeechMocks();
+    const items: ConversationItem[] = [
+      {
+        id: "msg-audio-full-1",
+        kind: "message",
+        role: "assistant",
+        text: "# Heading\n\nParagraph with `code`.",
+      },
+    ];
+
+    render(
+      <Messages items={items} />,
+    );
+
+    fireEvent.click(screen.getByRole("button", { name: "Response audio" }));
+    fireEvent.click(screen.getByRole("button", { name: "Listen full" }));
+
+    expect(speak).toHaveBeenCalledTimes(1);
+    expect(utterances[0]?.text).toContain("Heading");
+
+    act(() => {
+      utterances[0]?.onend?.(new Event("end"));
+    });
+
+    expect(speak).toHaveBeenCalledTimes(2);
+    expect(utterances[1]?.text).toContain("Paragraph with code.");
+  });
+
it("generates and reads a summary with the selected model", async () => { + const { speak, utterances } = installSpeechMocks(); + generateMessageAudioSummaryMock.mockResolvedValueOnce("Short spoken summary"); + const items: ConversationItem[] = [ + { + id: "msg-audio-summary-1", + kind: "message", + role: "assistant", + text: "Long agent response", + }, + ]; + + render( + , + ); + + fireEvent.click(screen.getByRole("button", { name: "Response audio" })); + fireEvent.click(screen.getByRole("button", { name: "Listen summary" })); + + await waitFor(() => { + expect(generateMessageAudioSummaryMock).toHaveBeenCalledWith( + "ws-1", + "Long agent response", + "gpt-5-codex", + ); + }); + await waitFor(() => { + expect(speak).toHaveBeenCalledTimes(1); + }); + expect(utterances[0]?.text).toBe("Short spoken summary"); + }); + + it("stops the active spoken response from the message menu", async () => { + const { cancel } = installSpeechMocks(); + const items: ConversationItem[] = [ + { + id: "msg-audio-stop-1", + kind: "message", + role: "assistant", + text: "Hello world", + }, + ]; + + render( + , + ); + + fireEvent.click(screen.getByRole("button", { name: "Response audio" })); + fireEvent.click(screen.getByRole("button", { name: "Listen full" })); + expect(cancel).toHaveBeenCalledTimes(1); + + fireEvent.click(screen.getByRole("button", { name: "Response audio" })); + fireEvent.click(screen.getByRole("button", { name: "Stop" })); + + await waitFor(() => { + expect(cancel).toHaveBeenCalledTimes(2); + }); + }); + it("opens linked review thread when clicking thread link", () => { const onOpenThreadLink = vi.fn(); const items: ConversationItem[] = [ diff --git a/src/features/messages/components/Messages.tsx b/src/features/messages/components/Messages.tsx index da2a318e1..52512618a 100644 --- a/src/features/messages/components/Messages.tsx +++ b/src/features/messages/components/Messages.tsx @@ -10,6 +10,7 @@ import type { import { PlanReadyFollowupMessage } from "../../app/components/PlanReadyFollowupMessage"; import { RequestUserInputMessage } from "../../app/components/RequestUserInputMessage"; import { useFileLinkOpener } from "../hooks/useFileLinkOpener"; +import { useMessageAudio } from "../hooks/useMessageAudio"; import { formatCount, parseReasoning } from "../utils/messageRenderUtils"; import { DiffRow, @@ -36,6 +37,7 @@ type MessagesProps = { workspacePath?: string | null; openTargets: OpenAppTarget[]; selectedOpenAppId: string; + selectedModelId?: string | null; codeBlockCopyUseModifier?: boolean; showMessageFilePath?: boolean; userInputRequests?: RequestUserInputRequest[]; @@ -62,6 +64,7 @@ export const Messages = memo(function Messages({ workspacePath = null, openTargets, selectedOpenAppId, + selectedModelId = null, codeBlockCopyUseModifier = false, showMessageFilePath = true, userInputRequests = [], @@ -84,6 +87,16 @@ export const Messages = memo(function Messages({ openTargets, selectedOpenAppId, ); + const { + getMessageAudioState, + listenToMessage, + listenToMessageSummary, + stopMessageAudio, + } = useMessageAudio({ + workspaceId, + threadId, + selectedModelId, + }); const handleOpenThreadLink = useCallback( (threadId: string) => { onOpenThreadLink?.(threadId, workspaceId ?? 
@@ -160,6 +173,10 @@
             onOpenFileLink={openFileLink}
             onOpenFileLinkMenu={showFileLinkMenu}
             onOpenThreadLink={handleOpenThreadLink}
+            audioState={getMessageAudioState(item.id)}
+            onListenFull={listenToMessage}
+            onListenSummary={listenToMessageSummary}
+            onStopAudio={stopMessageAudio}
           />
         );
       }
diff --git a/src/features/messages/hooks/useMessageAudio.test.tsx b/src/features/messages/hooks/useMessageAudio.test.tsx
new file mode 100644
index 000000000..2956466c5
--- /dev/null
+++ b/src/features/messages/hooks/useMessageAudio.test.tsx
@@ -0,0 +1,215 @@
+// @vitest-environment jsdom
+import { act, renderHook } from "@testing-library/react";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { generateMessageAudioSummary } from "@services/tauri";
+import { pushErrorToast } from "@services/toasts";
+import { useMessageAudio } from "./useMessageAudio";
+
+vi.mock("@services/tauri", () => ({
+  generateMessageAudioSummary: vi.fn(),
+}));
+
+vi.mock("@services/toasts", () => ({
+  pushErrorToast: vi.fn(),
+}));
+
+type MockUtteranceInstance = {
+  text: string;
+  onend: ((event: Event) => void) | null;
+  onerror: ((event: Event) => void) | null;
+};
+
+describe("useMessageAudio", () => {
+  let speakMock: ReturnType<typeof vi.fn>;
+  let cancelMock: ReturnType<typeof vi.fn>;
+  let utterances: MockUtteranceInstance[];
+  let originalSpeechSynthesis: SpeechSynthesis | undefined;
+  let originalSpeechSynthesisUtterance: typeof SpeechSynthesisUtterance | undefined;
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    utterances = [];
+    speakMock = vi.fn((utterance: MockUtteranceInstance) => {
+      utterances.push(utterance);
+    });
+    cancelMock = vi.fn();
+    originalSpeechSynthesis = window.speechSynthesis;
+    originalSpeechSynthesisUtterance = globalThis.SpeechSynthesisUtterance;
+
+    Object.defineProperty(window, "speechSynthesis", {
+      configurable: true,
+      value: {
+        speak: speakMock,
+        cancel: cancelMock,
+      } satisfies Partial<SpeechSynthesis>,
+    });
+
+    class MockSpeechSynthesisUtterance {
+      text: string;
+      onend: ((event: Event) => void) | null = null;
+      onerror: ((event: Event) => void) | null = null;
+
+      constructor(text: string) {
+        this.text = text;
+      }
+    }
+
+    Object.defineProperty(globalThis, "SpeechSynthesisUtterance", {
+      configurable: true,
+      value: MockSpeechSynthesisUtterance,
+    });
+  });
+
+  afterEach(() => {
+    Object.defineProperty(window, "speechSynthesis", {
+      configurable: true,
+      value: originalSpeechSynthesis,
+    });
+    Object.defineProperty(globalThis, "SpeechSynthesisUtterance", {
+      configurable: true,
+      value: originalSpeechSynthesisUtterance,
+    });
+  });
+
+  it("speaks full responses and clears active state when playback ends", () => {
+    const { result } = renderHook(() =>
+      useMessageAudio({
+        workspaceId: "ws-1",
+        threadId: "thread-1",
+        selectedModelId: "gpt-5-codex",
+      }),
+    );
+
+    act(() => {
+      result.current.listenToMessage("msg-1", "Hello world");
+    });
+
+    expect(cancelMock).toHaveBeenCalledTimes(1);
+    expect(speakMock).toHaveBeenCalledTimes(1);
+    expect(utterances[0]?.text).toBe("Hello world");
+    expect(result.current.getMessageAudioState("msg-1")).toEqual({
+      isActive: true,
+      mode: "full",
+      status: "speaking",
+    });
+
+    act(() => {
+      utterances[0]?.onend?.(new Event("end"));
+    });
+
+    expect(result.current.getMessageAudioState("msg-1")).toEqual({
+      isActive: false,
+      mode: null,
+      status: "idle",
+    });
+  });
+
+  it("generates and caches summaries with the selected model", async () => {
+    let resolveSummary: ((value: string) => void) | null = null;
+    const summaryPromise = new Promise<string>((resolve) => {
+      resolveSummary = resolve;
+    });
+    vi.mocked(generateMessageAudioSummary).mockReturnValueOnce(summaryPromise);
+
+    const { result } = renderHook(() =>
+      useMessageAudio({
+        workspaceId: "ws-1",
+        threadId: "thread-1",
+        selectedModelId: "gpt-5-codex",
+      }),
+    );
+
+    act(() => {
+      void result.current.listenToMessageSummary("msg-1", "Long agent response");
+    });
+
+    expect(result.current.getMessageAudioState("msg-1")).toEqual({
+      isActive: true,
+      mode: "summary",
+      status: "preparing",
+    });
+
+    await act(async () => {
+      resolveSummary?.("Short spoken summary");
+      await summaryPromise;
+    });
+
+    expect(generateMessageAudioSummary).toHaveBeenCalledWith(
+      "ws-1",
+      "Long agent response",
+      "gpt-5-codex",
+    );
+    expect(utterances[utterances.length - 1]?.text).toBe("Short spoken summary");
+
+    act(() => {
+      result.current.stopMessageAudio("msg-1");
+    });
+
+    await act(async () => {
+      await result.current.listenToMessageSummary("msg-1", "Long agent response");
+    });
+
+    expect(generateMessageAudioSummary).toHaveBeenCalledTimes(1);
+    expect(utterances[utterances.length - 1]?.text).toBe("Short spoken summary");
+  });
+
+  it("cancels active playback when the thread changes", () => {
+    const { result, rerender } = renderHook(
+      (props: { workspaceId: string | null; threadId: string | null }) =>
+        useMessageAudio({
+          ...props,
+          selectedModelId: "gpt-5-codex",
+        }),
+      {
+        initialProps: {
+          workspaceId: "ws-1",
+          threadId: "thread-1",
+        },
+      },
+    );
+
+    act(() => {
+      result.current.listenToMessage("msg-1", "Hello world");
+    });
+
+    rerender({
+      workspaceId: "ws-1",
+      threadId: "thread-2",
+    });
+
+    expect(cancelMock).toHaveBeenCalledTimes(2);
+    expect(result.current.getMessageAudioState("msg-1")).toEqual({
+      isActive: false,
+      mode: null,
+      status: "idle",
+    });
+  });
+
+  it("surfaces an error toast when speech synthesis is unavailable", () => {
+    Object.defineProperty(window, "speechSynthesis", {
+      configurable: true,
+      value: undefined,
+    });
+    Object.defineProperty(globalThis, "SpeechSynthesisUtterance", {
+      configurable: true,
+      value: undefined,
+    });
+
+    const { result } = renderHook(() =>
+      useMessageAudio({
+        workspaceId: "ws-1",
+        threadId: "thread-1",
+        selectedModelId: "gpt-5-codex",
+      }),
+    );
+
+    act(() => {
+      result.current.listenToMessage("msg-1", "Hello world");
+    });
+
+    expect(pushErrorToast).toHaveBeenCalledWith({
+      title: "Audio playback unavailable",
+      message: "This environment does not support spoken response playback.",
+    });
+  });
+});
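Review note: the hook below serializes playback through a monotonically increasing request token rather than tracking utterance handles. Every play or stop bumps the token, so any stale async callback sees a newer token and bails instead of racing the new playback. A stripped-down sketch of the pattern (names are illustrative):

```ts
// Each new request bumps the token; callbacks that captured an older token
// become no-ops rather than mutating state that belongs to a newer request.
let requestToken = 0;

function startRequest(run: (isStale: () => boolean) => void) {
  const token = ++requestToken;
  run(() => requestToken !== token);
}

startRequest((isStale) => {
  setTimeout(() => {
    if (isStale()) return; // a newer request superseded this one
    console.log("still the active request");
  }, 100);
});
```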
diff --git a/src/features/messages/hooks/useMessageAudio.ts b/src/features/messages/hooks/useMessageAudio.ts
new file mode 100644
index 000000000..3aceef336
--- /dev/null
+++ b/src/features/messages/hooks/useMessageAudio.ts
@@ -0,0 +1,388 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { generateMessageAudioSummary } from "@services/tauri";
+import { pushErrorToast } from "@services/toasts";
+
+export type MessageAudioMode = "full" | "summary";
+export type MessageAudioStatus = "idle" | "preparing" | "speaking";
+
+type ActivePlayback = {
+  messageId: string | null;
+  mode: MessageAudioMode | null;
+  status: MessageAudioStatus;
+};
+
+export type MessageAudioState = {
+  isActive: boolean;
+  mode: MessageAudioMode | null;
+  status: MessageAudioStatus;
+};
+
+type UseMessageAudioArgs = {
+  workspaceId: string | null;
+  threadId: string | null;
+  selectedModelId?: string | null;
+};
+
+const MAX_SPEECH_CHUNK_LENGTH = 900;
+const IDLE_PLAYBACK: ActivePlayback = {
+  messageId: null,
+  mode: null,
+  status: "idle",
+};
+
+function resolveSpeechSynthesis(): SpeechSynthesis | null {
+  if (typeof window === "undefined") {
+    return null;
+  }
+  if (typeof SpeechSynthesisUtterance !== "function") {
+    return null;
+  }
+  return window.speechSynthesis ?? null;
+}
+
+function normalizeSpeechText(text: string): string {
+  return text
+    .replace(/\r\n?/g, "\n")
+    .replace(/\u00a0/g, " ")
+    .split("\n")
+    .map((line) => line.trim())
+    .join("\n")
+    .replace(/\n{3,}/g, "\n\n")
+    .trim();
+}
+
+function splitWordsToMaxLength(text: string, maxChunkLength: number): string[] {
+  const words = text.trim().split(/\s+/).filter(Boolean);
+  if (words.length === 0) {
+    return [];
+  }
+
+  const chunks: string[] = [];
+  let current = "";
+  for (const word of words) {
+    const candidate = current ? `${current} ${word}` : word;
+    if (candidate.length > maxChunkLength && current) {
+      chunks.push(current);
+      current = word;
+      continue;
+    }
+    current = candidate;
+  }
+  if (current) {
+    chunks.push(current);
+  }
+  return chunks;
+}
+
+function splitParagraphToChunks(paragraph: string, maxChunkLength: number): string[] {
+  if (paragraph.length <= maxChunkLength) {
+    return [paragraph];
+  }
+
+  const sentences = paragraph
+    .split(/(?<=[.!?])\s+/)
+    .map((entry) => entry.trim())
+    .filter(Boolean);
+
+  if (sentences.length <= 1) {
+    return splitWordsToMaxLength(paragraph, maxChunkLength);
+  }
+
+  const chunks: string[] = [];
+  let current = "";
+  for (const sentence of sentences) {
+    if (sentence.length > maxChunkLength) {
+      if (current) {
+        chunks.push(current);
+        current = "";
+      }
+      chunks.push(...splitWordsToMaxLength(sentence, maxChunkLength));
+      continue;
+    }
+
+    const candidate = current ? `${current} ${sentence}` : sentence;
+    if (candidate.length > maxChunkLength && current) {
+      chunks.push(current);
+      current = sentence;
+      continue;
+    }
+    current = candidate;
+  }
+  if (current) {
+    chunks.push(current);
+  }
+  return chunks;
+}
+
+export function splitSpeechText(
+  text: string,
+  maxChunkLength: number = MAX_SPEECH_CHUNK_LENGTH,
+): string[] {
+  const normalized = normalizeSpeechText(text);
+  if (!normalized) {
+    return [];
+  }
+
+  return normalized
+    .split(/\n{2,}/)
+    .map((paragraph) => paragraph.trim())
+    .filter(Boolean)
+    .flatMap((paragraph) => splitParagraphToChunks(paragraph, maxChunkLength));
+}
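Review note: because `splitSpeechText` is exported, its contract is easy to pin down. A hedged example of the expected splits; the 24-character limit below is only for illustration, the hook itself uses the 900-character `MAX_SPEECH_CHUNK_LENGTH`:

```ts
import { splitSpeechText } from "./useMessageAudio";

// Paragraphs split first, then sentences, then raw words as a last resort.
splitSpeechText("First part.\n\nSecond part.");
// => ["First part.", "Second part."]

splitSpeechText("One sentence. Another sentence.", 24);
// => ["One sentence.", "Another sentence."]
```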
+
+function buildSummaryCacheKey({
+  workspaceId,
+  threadId,
+  messageId,
+  modelId,
+}: {
+  workspaceId: string | null;
+  threadId: string | null;
+  messageId: string;
+  modelId: string | null | undefined;
+}) {
+  return [workspaceId ?? "no-workspace", threadId ?? "no-thread", messageId, modelId ?? "default"]
+    .join("::");
+}
+
+export function useMessageAudio({
+  workspaceId,
+  threadId,
+  selectedModelId = null,
+}: UseMessageAudioArgs) {
+  const [playback, setPlayback] = useState(IDLE_PLAYBACK);
+  const playbackRef = useRef(playback);
+  const requestTokenRef = useRef(0);
+  const summaryCacheRef = useRef<Map<string, string>>(new Map());
+  const previousWorkspaceIdRef = useRef(workspaceId);
+  const previousThreadIdRef = useRef(threadId);
+
+  useEffect(() => {
+    playbackRef.current = playback;
+  }, [playback]);
+
+  const setIdlePlayback = useCallback(() => {
+    setPlayback(IDLE_PLAYBACK);
+  }, []);
+
+  const cancelSpeech = useCallback(() => {
+    const synthesis = resolveSpeechSynthesis();
+    if (!synthesis) {
+      return;
+    }
+    try {
+      synthesis.cancel();
+    } catch {
+      // Some runtimes can throw here; cancelation is best effort.
+    }
+  }, []);
+
+  const cancelActivePlayback = useCallback(() => {
+    requestTokenRef.current += 1;
+    cancelSpeech();
+    setIdlePlayback();
+  }, [cancelSpeech, setIdlePlayback]);
+
+  const reportUnavailable = useCallback(() => {
+    pushErrorToast({
+      title: "Audio playback unavailable",
+      message: "This environment does not support spoken response playback.",
+    });
+  }, []);
+
+  const startSpeaking = useCallback(
+    (messageId: string, mode: MessageAudioMode, text: string, token: number) => {
+      const synthesis = resolveSpeechSynthesis();
+      if (!synthesis) {
+        reportUnavailable();
+        setIdlePlayback();
+        return;
+      }
+
+      const chunks = splitSpeechText(text);
+      if (chunks.length === 0) {
+        setIdlePlayback();
+        return;
+      }
+
+      setPlayback({
+        messageId,
+        mode,
+        status: "speaking",
+      });
+
+      const speakChunk = (index: number) => {
+        if (requestTokenRef.current !== token) {
+          return;
+        }
+
+        if (index >= chunks.length) {
+          setIdlePlayback();
+          return;
+        }
+
+        const utterance = new SpeechSynthesisUtterance(chunks[index]);
+        utterance.onend = () => {
+          if (requestTokenRef.current !== token) {
+            return;
+          }
+          speakChunk(index + 1);
+        };
+        utterance.onerror = () => {
+          if (requestTokenRef.current !== token) {
+            return;
+          }
+          pushErrorToast({
+            title: "Audio playback failed",
+            message: "The spoken response could not be played.",
+          });
+          setIdlePlayback();
+        };
+
+        try {
+          synthesis.speak(utterance);
+        } catch (error) {
+          const message =
+            error instanceof Error ? error.message : "The spoken response could not be played.";
+          pushErrorToast({
+            title: "Audio playback failed",
+            message,
+          });
+          setIdlePlayback();
+        }
+      };
+
+      speakChunk(0);
+    },
+    [reportUnavailable, setIdlePlayback],
+  );
+
+  const listenToMessage = useCallback(
+    (messageId: string, speakableText: string) => {
+      const token = requestTokenRef.current + 1;
+      requestTokenRef.current = token;
+      cancelSpeech();
+      startSpeaking(messageId, "full", speakableText, token);
+    },
+    [cancelSpeech, startSpeaking],
+  );
+
+  const listenToMessageSummary = useCallback(
+    async (messageId: string, responseText: string) => {
+      if (!workspaceId) {
+        pushErrorToast({
+          title: "Response summary unavailable",
+          message: "A workspace must be active before generating a response summary.",
+        });
+        return;
+      }
+
+      const token = requestTokenRef.current + 1;
+      requestTokenRef.current = token;
+      cancelSpeech();
+      setPlayback({
+        messageId,
+        mode: "summary",
+        status: "preparing",
+      });
+
+      const cacheKey = buildSummaryCacheKey({
+        workspaceId,
+        threadId,
+        messageId,
+        modelId: selectedModelId,
+      });
+      const cachedSummary = summaryCacheRef.current.get(cacheKey);
+      if (cachedSummary) {
+        startSpeaking(messageId, "summary", cachedSummary, token);
+        return;
+      }
+
+      try {
+        const summary = await generateMessageAudioSummary(
+          workspaceId,
+          responseText,
+          selectedModelId,
+        );
+        if (requestTokenRef.current !== token) {
+          return;
+        }
+        summaryCacheRef.current.set(cacheKey, summary);
+        startSpeaking(messageId, "summary", summary, token);
+      } catch (error) {
+        if (requestTokenRef.current !== token) {
+          return;
+        }
+        const message =
+          error instanceof Error ? error.message : "The response summary could not be generated.";
+        pushErrorToast({
+          title: "Response summary failed",
+          message,
+        });
+        setIdlePlayback();
+      }
+    },
+    [
+      cancelSpeech,
+      selectedModelId,
+      setIdlePlayback,
+      startSpeaking,
+      threadId,
+      workspaceId,
+    ],
+  );
+
+  const stopMessageAudio = useCallback(
+    (messageId?: string) => {
+      if (messageId && playbackRef.current.messageId !== messageId) {
+        return;
+      }
+      cancelActivePlayback();
+    },
+    [cancelActivePlayback],
+  );
+
+  useEffect(() => {
+    const workspaceChanged = previousWorkspaceIdRef.current !== workspaceId;
+    const threadChanged = previousThreadIdRef.current !== threadId;
+    previousWorkspaceIdRef.current = workspaceId;
+    previousThreadIdRef.current = threadId;
+
+    if (workspaceChanged || threadChanged) {
+      cancelActivePlayback();
+    }
+  }, [cancelActivePlayback, threadId, workspaceId]);
+
+  useEffect(
+    () => () => {
+      requestTokenRef.current += 1;
+      cancelSpeech();
+    },
+    [cancelSpeech],
+  );
+
+  const getMessageAudioState = useCallback(
+    (messageId: string): MessageAudioState => {
+      if (playback.messageId !== messageId) {
+        return {
+          isActive: false,
+          mode: null,
+          status: "idle",
+        };
+      }
+      return {
+        isActive: true,
+        mode: playback.mode,
+        status: playback.status,
+      };
+    },
+    [playback],
+  );
+
+  return {
+    getMessageAudioState,
+    listenToMessage,
+    listenToMessageSummary,
+    stopMessageAudio,
+  };
+}
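Review note: downstream wiring stays thin, since a consumer only needs the four functions the hook returns. A minimal, hypothetical consumer sketch (the real wiring is the Messages to MessageRow thread shown earlier in this diff):

```tsx
import { useMessageAudio } from "./useMessageAudio";

// Hypothetical consumer: per-message state drives button styling, while the
// three callbacks map onto the popover menu actions.
function AudioControlsSketch({ messageId, text }: { messageId: string; text: string }) {
  const { getMessageAudioState, listenToMessage, listenToMessageSummary, stopMessageAudio } =
    useMessageAudio({ workspaceId: "ws-1", threadId: "thread-1", selectedModelId: null });
  const state = getMessageAudioState(messageId);

  return (
    <>
      <button onClick={() => listenToMessage(messageId, text)}>Listen full</button>
      <button onClick={() => void listenToMessageSummary(messageId, text)}>Listen summary</button>
      <button onClick={() => stopMessageAudio(messageId)} disabled={!state.isActive}>
        Stop
      </button>
    </>
  );
}
```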
diff --git a/src/services/tauri.test.ts b/src/services/tauri.test.ts
index eb411228a..933707946 100644
--- a/src/services/tauri.test.ts
+++ b/src/services/tauri.test.ts
@@ -51,6 +51,7 @@ import {
   readAgentConfigToml,
   readImageAsDataUrl,
   generateAgentDescription,
+  generateMessageAudioSummary,
   writeAgentConfigToml,
   writeAgentMd,
 } from "./tauri";
@@ -713,6 +714,23 @@ describe("tauri invoke wrappers", () => {
     });
   });
 
+  it("generates a hidden audio summary for a response", async () => {
+    const invokeMock = vi.mocked(invoke);
+    invokeMock.mockResolvedValueOnce("Updated src/App.tsx and ran npm run test.");
+
+    await generateMessageAudioSummary(
+      "ws-agent",
+      "Updated `src/App.tsx` and ran `npm run test`.",
+      "gpt-5-codex",
+    );
+
+    expect(invokeMock).toHaveBeenCalledWith("generate_message_audio_summary", {
+      workspaceId: "ws-agent",
+      responseText: "Updated `src/App.tsx` and ran `npm run test`.",
+      modelId: "gpt-5-codex",
+    });
+  });
+
   it("fills sendUserMessage defaults in payload", async () => {
     const invokeMock = vi.mocked(invoke);
     invokeMock.mockResolvedValueOnce({});
diff --git a/src/services/tauri.ts b/src/services/tauri.ts
index 029e0d31b..841e7cb20 100644
--- a/src/services/tauri.ts
+++ b/src/services/tauri.ts
@@ -1115,6 +1115,18 @@ export async function generateAgentDescription(
   return invoke("generate_agent_description", { workspaceId, description });
 }
 
+export async function generateMessageAudioSummary(
+  workspaceId: string,
+  responseText: string,
+  modelId?: string | null,
+): Promise<string> {
+  return invoke("generate_message_audio_summary", {
+    workspaceId,
+    responseText,
+    modelId: modelId ?? null,
+  });
+}
+
 export type AppBuildType = "debug" | "release";
 
 export async function getAppBuildType(): Promise<AppBuildType> {
diff --git a/src/styles/messages.css b/src/styles/messages.css
index 795066974..7d0acd438 100644
--- a/src/styles/messages.css
+++ b/src/styles/messages.css
@@ -198,44 +198,63 @@
   margin: 0;
 }
 
-.message-copy-button {
-  display: inline-flex;
-  align-items: center;
-  justify-content: center;
+.message-bubble-actions {
   position: absolute;
   right: 6px;
   bottom: -12px;
-  padding: 4px;
-  border-radius: 999px;
-  background: var(--surface-card-strong);
-  border: 1px solid var(--border-strong);
+  display: inline-flex;
+  align-items: center;
+  gap: 6px;
   opacity: 0;
   transform: translateY(4px);
   transition: opacity 160ms ease, transform 160ms ease;
+  z-index: 2;
 }
 
-.message-quote-button {
+.message-copy-button,
+.message-quote-button,
+.message-audio-button {
   display: inline-flex;
   align-items: center;
   justify-content: center;
-  position: absolute;
-  right: 34px;
-  bottom: -12px;
   padding: 4px;
   border-radius: 999px;
   background: var(--surface-card-strong);
   border: 1px solid var(--border-strong);
-  opacity: 0;
-  transform: translateY(4px);
-  transition: opacity 160ms ease, transform 160ms ease;
 }
 
-.message:hover .message-copy-button,
-.message:hover .message-quote-button {
+.message:hover .message-bubble-actions,
+.message:focus-within .message-bubble-actions,
+.message-bubble-actions.is-active {
   opacity: 1;
   transform: translateY(0);
 }
 
+.message-audio-menu-wrap {
+  position: relative;
+  display: inline-flex;
+}
+
+.message-audio-button.is-active {
+  color: var(--text-accent);
+  border-color: color-mix(in srgb, var(--border-accent) 48%, var(--border-strong));
+}
+
+.message-audio-button.is-busy {
+  animation: message-audio-pulse 1.1s ease-in-out infinite;
+}
+
+.message-audio-popover {
+  position: absolute;
+  right: 0;
+  bottom: calc(100% + 8px);
+  min-width: 11rem;
+  padding: 6px;
+  display: grid;
+  gap: 4px;
+  z-index: 3;
+}
+
 .message-copy-icon {
   position: relative;
   width: 14px;
@@ -276,6 +295,16 @@
   filter: blur(0);
 }
 
+@keyframes message-audio-pulse {
+  0%,
+  100% {
+    transform: scale(1);
+  }
+  50% {
+    transform: scale(1.06);
+  }
+}
+
 .message.user .bubble {
   max-width: min(72%, 560px);
   background: var(--cm-surface-panel-active);