From 61315a75ce2a3687ea568ea4e4e9150fd91d489e Mon Sep 17 00:00:00 2001 From: tianhei Date: Tue, 17 Mar 2026 19:06:15 +0800 Subject: [PATCH 01/22] feat(tts): add external provider fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: - add an OpenAI-compatible TTS adapter with request tests, /v1 path fallback, and non-audio response detection - add a legacy external TTS fallback provider and route all TTS generation through a single provider entrypoint - document the unattended design and implementation plan for TIA-51 and add environment keys for external TTS config Rationale: - the existing route hardcoded one anonymous upstream and ignored the voice parameter, so it could not meaningfully expand external TTS support - the configured API endpoint in this environment exposes chat models but does not provide a standard TTS audio endpoint, so runtime fallback is required - keeping the response contract as { audio } preserves the editor/media insertion flow while making provider behavior testable and replaceable Tests: - bun test apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/constants/tts-constants.ts packages/env/src/web.ts - bunx tsc -p apps/web/tsconfig.json --noEmit - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4100 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; const request = new NextRequest("http://localhost/api/tts/generate", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: "你好,Cutia。", voice: 
"default" }) }); const response = await POST(request); const json = await response.json(); console.log(JSON.stringify({ status: response.status, audioLength: json.audio?.length ?? 0, audioHead: json.audio?.slice(0, 8) ?? null, error: json.error ?? null }));' (workdir: apps/web) Co-authored-by: Codex --- apps/web/src/app/api/tts/generate/route.ts | 58 +++--- apps/web/src/constants/tts-constants.ts | 5 +- apps/web/src/lib/tts/legacy.ts | 55 ++++++ .../web/src/lib/tts/openai-compatible.test.ts | 164 +++++++++++++++++ apps/web/src/lib/tts/openai-compatible.ts | 169 ++++++++++++++++++ apps/web/src/lib/tts/provider.test.ts | 53 ++++++ apps/web/src/lib/tts/provider.ts | 32 ++++ ...2026-03-17-tts-external-provider-design.md | 132 ++++++++++++++ .../plans/2026-03-17-tts-external-provider.md | 147 +++++++++++++++ packages/env/src/web.ts | 3 + 10 files changed, 778 insertions(+), 40 deletions(-) create mode 100644 apps/web/src/lib/tts/legacy.ts create mode 100644 apps/web/src/lib/tts/openai-compatible.test.ts create mode 100644 apps/web/src/lib/tts/openai-compatible.ts create mode 100644 apps/web/src/lib/tts/provider.test.ts create mode 100644 apps/web/src/lib/tts/provider.ts create mode 100644 docs/plans/2026-03-17-tts-external-provider-design.md create mode 100644 docs/plans/2026-03-17-tts-external-provider.md diff --git a/apps/web/src/app/api/tts/generate/route.ts b/apps/web/src/app/api/tts/generate/route.ts index 6767f75..9155317 100644 --- a/apps/web/src/app/api/tts/generate/route.ts +++ b/apps/web/src/app/api/tts/generate/route.ts @@ -1,18 +1,13 @@ +import { webEnv } from "@cutia/env/web"; import { type NextRequest, NextResponse } from "next/server"; import { z } from "zod"; - -const TTS_API_BASE = "https://api.milorapart.top/apis/mbAIsc"; +import { synthesizeSpeechWithFallback } from "@/lib/tts/provider"; const requestSchema = z.object({ text: z.string().min(1, "Text is required").max(2000, "Text too long"), voice: z.string().optional(), }); -const 
upstreamResponseSchema = z.object({ - code: z.number(), - url: z.string().url(), -}); - export async function POST(request: NextRequest) { try { const body = await request.json(); @@ -28,42 +23,31 @@ export async function POST(request: NextRequest) { ); } - const { text } = validation.data; - const upstreamUrl = `${TTS_API_BASE}?${new URLSearchParams({ text, format: "mp3" })}`; - const upstreamResponse = await fetch(upstreamUrl); - - if (!upstreamResponse.ok) { - return NextResponse.json( - { error: `Upstream error: ${upstreamResponse.status}` }, - { status: 502 }, - ); - } - - const upstreamData = await upstreamResponse.json(); - const parsed = upstreamResponseSchema.safeParse(upstreamData); - - if (!parsed.success || parsed.data.code !== 200) { - return NextResponse.json( - { error: "TTS generation failed" }, - { status: 502 }, - ); - } - - const audioResponse = await fetch(parsed.data.url); - if (!audioResponse.ok) { - return NextResponse.json( - { error: `Failed to download audio: ${audioResponse.status}` }, - { status: 502 }, - ); - } - - const audioArrayBuffer = await audioResponse.arrayBuffer(); + const { text, voice } = validation.data; + const audioArrayBuffer = await synthesizeSpeechWithFallback({ + env: webEnv, + text, + voice, + }); const base64 = Buffer.from(audioArrayBuffer).toString("base64"); return NextResponse.json({ audio: base64 }); } catch (error) { const message = error instanceof Error ? 
error.message : "Unknown error"; console.error("TTS generate error:", error); + + if (message === "External TTS is not configured") { + return NextResponse.json({ error: message }, { status: 500 }); + } + + if ( + message.startsWith("External TTS request failed:") || + message === "External TTS returned empty audio" || + message.startsWith("Legacy TTS ") + ) { + return NextResponse.json({ error: message }, { status: 502 }); + } + return NextResponse.json( { error: "Internal server error", detail: message }, { status: 500 }, diff --git a/apps/web/src/constants/tts-constants.ts b/apps/web/src/constants/tts-constants.ts index 60c4084..5b27045 100644 --- a/apps/web/src/constants/tts-constants.ts +++ b/apps/web/src/constants/tts-constants.ts @@ -3,8 +3,7 @@ export interface VoicePack { name: string; } -export const VOICE_PACKS: VoicePack[] = [ - { id: "default", name: "Default" }, -]; +export const VOICE_PACKS: VoicePack[] = [{ id: "default", name: "Default" }]; export const DEFAULT_VOICE_PACK = "default"; +export const DEFAULT_EXTERNAL_TTS_VOICE = "alloy"; diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts new file mode 100644 index 0000000..15243f2 --- /dev/null +++ b/apps/web/src/lib/tts/legacy.ts @@ -0,0 +1,55 @@ +import { z } from "zod"; + +const LEGACY_TTS_API_BASE = "https://api.milorapart.top/apis/mbAIsc"; + +const legacyResponseSchema = z.object({ + code: z.number(), + url: z.string().url(), +}); + +type FetchLike = ( + input: RequestInfo | URL, + init?: RequestInit, +) => Promise; + +export async function synthesizeSpeechWithLegacyProvider({ + text, + fetchImpl = fetch, +}: { + text: string; + voice?: string; + fetchImpl?: FetchLike; +}): Promise { + const upstreamUrl = `${LEGACY_TTS_API_BASE}?${new URLSearchParams({ + format: "mp3", + text, + })}`; + const upstreamResponse = await fetchImpl(upstreamUrl); + + if (!upstreamResponse.ok) { + throw new Error(`Legacy TTS request failed: ${upstreamResponse.status}`); + } + + const 
upstreamJson = await upstreamResponse.json().catch(() => null); + const parsed = legacyResponseSchema.safeParse(upstreamJson); + + if (!parsed.success || parsed.data.code !== 200) { + throw new Error("Legacy TTS generation failed"); + } + + const audioResponse = await fetchImpl(parsed.data.url); + + if (!audioResponse.ok) { + throw new Error( + `Legacy TTS audio download failed: ${audioResponse.status}`, + ); + } + + const audio = await audioResponse.arrayBuffer(); + + if (audio.byteLength === 0) { + throw new Error("Legacy TTS returned empty audio"); + } + + return audio; +} diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts new file mode 100644 index 0000000..4769b48 --- /dev/null +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -0,0 +1,164 @@ +import { describe, expect, test } from "bun:test"; +import { + DEFAULT_EXTERNAL_TTS_VOICE, + getExternalTtsConfig, + synthesizeSpeechWithOpenAiCompatible, +} from "./openai-compatible"; + +describe("getExternalTtsConfig", () => { + test("reads required config from environment", () => { + const config = getExternalTtsConfig({ + env: { + API_BASE_URL: "https://example.com/v1/", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + }); + + expect(config).toEqual({ + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }); + }); + + test("throws a clear error when config is incomplete", () => { + expect(() => + getExternalTtsConfig({ + env: { + API_BASE_URL: "https://example.com/v1", + API_KEY: "secret", + }, + }), + ).toThrow("External TTS is not configured"); + }); +}); + +describe("synthesizeSpeechWithOpenAiCompatible", () => { + test("posts audio speech requests with the mapped default voice", async () => { + const calls: Array<{ input: RequestInfo | URL; init?: RequestInit }> = []; + const audioBytes = Uint8Array.from([1, 2, 3, 4]); + + const result = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: 
"https://example.com/v1/", + apiKey: "secret", + model: "tts-1", + }, + text: "你好,Cutia", + voice: "default", + fetchImpl: async (input, init) => { + calls.push({ input, init }); + return new Response(audioBytes, { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([1, 2, 3, 4]); + expect(calls).toHaveLength(1); + expect(calls[0]?.input).toBe("https://example.com/v1/audio/speech"); + + const headers = new Headers(calls[0]?.init?.headers); + expect(headers.get("authorization")).toBe("Bearer secret"); + expect(headers.get("content-type")).toBe("application/json"); + + expect(JSON.parse(String(calls[0]?.init?.body))).toEqual({ + input: "你好,Cutia", + model: "tts-1", + response_format: "mp3", + voice: DEFAULT_EXTERNAL_TTS_VOICE, + }); + }); + + test("surfaces upstream JSON error messages", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "nova", + fetchImpl: async () => + Response.json( + { error: { message: "quota exceeded" } }, + { status: 429 }, + ), + }), + ).rejects.toThrow("quota exceeded"); + }); + + test("falls back to the root audio speech path when the v1 path returns 404", async () => { + const calls: string[] = []; + + const audio = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async (input) => { + const url = String(input); + calls.push(url); + + if (url === "https://example.com/v1/audio/speech") { + return new Response("page not found", { status: 404 }); + } + + return new Response(Uint8Array.from([9, 8, 7]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + }, + }); + + expect(Array.from(new Uint8Array(audio))).toEqual([9, 8, 7]); + expect(calls).toEqual([ + 
"https://example.com/v1/audio/speech", + "https://example.com/audio/speech", + ]); + }); + + test("rejects non-audio success responses", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + new Response("", { + status: 200, + headers: { "Content-Type": "text/html; charset=utf-8" }, + }), + }), + ).rejects.toThrow("Expected audio response"); + }); + + test("surfaces upstream text errors when JSON is unavailable", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "nova", + fetchImpl: async () => + new Response("gateway timeout", { + status: 504, + headers: { "Content-Type": "text/plain" }, + }), + }), + ).rejects.toThrow("gateway timeout"); + }); +}); diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts new file mode 100644 index 0000000..8396a1c --- /dev/null +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -0,0 +1,169 @@ +import { z } from "zod"; +import { + DEFAULT_EXTERNAL_TTS_VOICE, + DEFAULT_VOICE_PACK, +} from "@/constants/tts-constants"; + +const externalTtsConfigSchema = z.object({ + API_BASE_URL: z.string().min(1), + API_MODEL: z.string().min(1), + API_KEY: z.string().min(1), +}); + +export { DEFAULT_EXTERNAL_TTS_VOICE }; + +export interface ExternalTtsConfig { + apiBaseUrl: string; + apiKey: string; + model: string; +} + +type FetchLike = ( + input: RequestInfo | URL, + init?: RequestInit, +) => Promise; + +export function getExternalTtsConfig({ + env, +}: { + env: Record; +}): ExternalTtsConfig { + const parsed = externalTtsConfigSchema.safeParse(env); + + if (!parsed.success) { + throw new Error("External TTS is not configured"); + } + + return { + apiBaseUrl: 
parsed.data.API_BASE_URL.replace(/\/+$/, ""), + apiKey: parsed.data.API_KEY, + model: parsed.data.API_MODEL, + }; +} + +function resolveVoice({ voice }: { voice?: string }): string { + if (!voice || voice === DEFAULT_VOICE_PACK) { + return DEFAULT_EXTERNAL_TTS_VOICE; + } + + return voice; +} + +async function getUpstreamErrorMessage({ + response, +}: { + response: Response; +}): Promise { + const contentType = response.headers.get("content-type") ?? ""; + + if (contentType.includes("application/json")) { + const json = (await response.json().catch(() => null)) as { + error?: + | string + | { + message?: string; + }; + } | null; + + if (typeof json?.error === "string" && json.error.trim()) { + return json.error; + } + + if ( + typeof json?.error === "object" && + typeof json.error?.message === "string" && + json.error.message.trim() + ) { + return json.error.message; + } + } + + const text = await response.text().catch(() => ""); + if (text.trim()) { + return text; + } + + return String(response.status); +} + +function getSpeechEndpointUrls({ + apiBaseUrl, +}: { + apiBaseUrl: string; +}): string[] { + const normalizedBaseUrl = apiBaseUrl.replace(/\/+$/, ""); + const urls = [`${normalizedBaseUrl}/audio/speech`]; + + if (normalizedBaseUrl.endsWith("/v1")) { + urls.push(`${normalizedBaseUrl.slice(0, -3)}/audio/speech`); + } + + return [...new Set(urls)]; +} + +export async function synthesizeSpeechWithOpenAiCompatible({ + config, + text, + voice, + fetchImpl = fetch, +}: { + config: ExternalTtsConfig; + text: string; + voice?: string; + fetchImpl?: FetchLike; +}): Promise { + const endpointUrls = getSpeechEndpointUrls({ + apiBaseUrl: config.apiBaseUrl, + }); + const requestInit = { + method: "POST", + headers: { + Authorization: `Bearer ${config.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + input: text, + model: config.model, + response_format: "mp3", + voice: resolveVoice({ voice }), + }), + } satisfies RequestInit; + + let 
lastErrorResponse: Response | null = null; + + for (const endpointUrl of endpointUrls) { + const response = await fetchImpl(endpointUrl, requestInit); + + if (response.ok) { + const contentType = response.headers.get("content-type") ?? ""; + + if ( + contentType && + !contentType.includes("audio/") && + contentType !== "application/octet-stream" + ) { + throw new Error(`Expected audio response, received ${contentType}`); + } + + const audio = await response.arrayBuffer(); + + if (audio.byteLength === 0) { + throw new Error("External TTS returned empty audio"); + } + + return audio; + } + + lastErrorResponse = response; + + if (response.status !== 404) { + break; + } + } + + throw new Error( + `External TTS request failed: ${await getUpstreamErrorMessage({ + response: lastErrorResponse ?? new Response(null, { status: 500 }), + })}`, + ); +} diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts new file mode 100644 index 0000000..5b6f6d7 --- /dev/null +++ b/apps/web/src/lib/tts/provider.test.ts @@ -0,0 +1,53 @@ +import { describe, expect, test } from "bun:test"; +import { synthesizeSpeechWithFallback } from "./provider"; + +describe("synthesizeSpeechWithFallback", () => { + test("returns the configured external provider result when it succeeds", async () => { + let legacyCalled = false; + + const result = await synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => Uint8Array.from([1, 2, 3]).buffer, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([9, 9, 9]).buffer; + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([1, 2, 3]); + expect(legacyCalled).toBe(false); + }); + + test("falls back to the legacy provider when the configured provider is unsupported", async () => { + let legacyCalled = false; + + const result = await 
synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new Error( + "External TTS request failed: Expected audio response, received text/html; charset=utf-8", + ); + }, + legacySynthesize: async ({ text }) => { + legacyCalled = true; + expect(text).toBe("hello"); + return Uint8Array.from([7, 8, 9]).buffer; + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([7, 8, 9]); + expect(legacyCalled).toBe(true); + }); +}); diff --git a/apps/web/src/lib/tts/provider.ts b/apps/web/src/lib/tts/provider.ts new file mode 100644 index 0000000..5b2c95a --- /dev/null +++ b/apps/web/src/lib/tts/provider.ts @@ -0,0 +1,32 @@ +import { + getExternalTtsConfig, + synthesizeSpeechWithOpenAiCompatible, +} from "./openai-compatible"; +import { synthesizeSpeechWithLegacyProvider } from "./legacy"; + +type TtsEnv = { + API_BASE_URL?: string; + API_MODEL?: string; + API_KEY?: string; +}; + +export async function synthesizeSpeechWithFallback({ + env, + text, + voice, + openAiSynthesize = synthesizeSpeechWithOpenAiCompatible, + legacySynthesize = synthesizeSpeechWithLegacyProvider, +}: { + env: TtsEnv; + text: string; + voice?: string; + openAiSynthesize?: typeof synthesizeSpeechWithOpenAiCompatible; + legacySynthesize?: typeof synthesizeSpeechWithLegacyProvider; +}): Promise { + try { + const config = getExternalTtsConfig({ env }); + return await openAiSynthesize({ config, text, voice }); + } catch { + return legacySynthesize({ text, voice }); + } +} diff --git a/docs/plans/2026-03-17-tts-external-provider-design.md b/docs/plans/2026-03-17-tts-external-provider-design.md new file mode 100644 index 0000000..9638aec --- /dev/null +++ b/docs/plans/2026-03-17-tts-external-provider-design.md @@ -0,0 +1,132 @@ +# 外部 TTS 扩展设计 + +## 背景 + +`TIA-51` 要求 Cutia 支持调用外部 TTS API,把文本或对话内容生成语音并接入视频编辑流程。 + +当前仓库已经有一条从文本元素生成语音并插入时间线的链路,但服务端路由 
`apps/web/src/app/api/tts/generate/route.ts` 仍然把上游 TTS 服务硬编码为单一匿名接口,`voice` 参数也没有被真正消费。这意味着: + +- 外部 TTS 提供方无法通过环境配置切换 +- 语音选项只是前端占位,实际不会影响生成结果 +- 错误语义受限于硬编码上游,缺少可维护的适配层 + +## 无人值守前提 + +本次执行为无人值守编排,会直接根据工单描述和运行环境中的 `API_BASE_URL`、`API_MODEL`、`API_KEY` 做保守设计,不额外等待人工确认。 + +## 目标 + +- 通过环境变量配置外部 TTS 提供方 +- 支持以文本内容调用外部 TTS API 并返回可插入编辑器的音频 +- 让现有 `voice` 参数真正参与外部请求 +- 为失败场景提供明确、可测试的错误返回 + +## 非目标 + +- 不在本次中引入完整的多供应商设置 UI +- 不改动现有时间线插入和媒体入库的主流程 +- 不为每个第三方 TTS 服务单独做适配器注册中心 + +## 方案比较 + +### 方案 A: 保留当前硬编码上游,只补更多参数 + +优点: +- 改动最少 + +缺点: +- 仍然不满足“外部 TTS 能力扩展”的核心要求 +- 供应商不可配置 +- 无法安全复用运行环境已提供的 API 配置 + +结论:不采用。 + +### 方案 B: 改为环境驱动的 OpenAI 兼容 TTS 适配层 + +优点: +- 只需要一层薄适配,即可支持大量 OpenAI 兼容的 TTS 服务 +- 和当前运行环境提供的 `API_BASE_URL`、`API_MODEL`、`API_KEY` 直接对齐 +- 前端接口保持不变,编辑器链路改动最小 + +缺点: +- 需要定义默认 voice 映射 +- 需要自己处理二进制音频响应与错误解析 + +结论:采用。 + +### 方案 C: 直接搭建供应商注册中心 + +优点: +- 长期扩展性最好 + +缺点: +- 对当前工单明显过度设计 +- 需要更多配置、UI 和测试面 + +结论:当前不采用。 + +## 决策 + +采用方案 B:新增一个面向 OpenAI 兼容接口的 TTS 适配层,服务端路由只负责参数校验、调用适配器并把音频转为前端可消费的 base64。 + +## 架构 + +### 服务端 + +- 在 `apps/web/src/lib/tts/` 下新增可测试的适配模块 +- 模块职责: + - 读取并规范化 TTS 配置 + - 把 `text`/`voice` 转换为上游 `/audio/speech` 请求 + - 解析上游失败响应,输出明确错误 + - 返回音频 `ArrayBuffer` +- 路由只保留: + - 请求体验证 + - 调用适配模块 + - 转 base64 返回 `{ audio }` + +### 前端 + +- `apps/web/src/lib/tts/service.ts` 保持调用 `/api/tts/generate` 的协议不变 +- `apps/web/src/constants/tts-constants.ts` 提供可实际使用的 voice 列表与默认值 +- 文本面板和动作系统继续复用既有插入媒体/时间线逻辑 + +## 数据流 + +1. 用户在文本属性面板或动作系统触发 TTS +2. 前端向 `/api/tts/generate` 提交 `{ text, voice }` +3. 服务端校验参数 +4. 服务端使用环境变量构造对外部 TTS API 的请求 +5. 外部 TTS 返回音频二进制 +6. 服务端转为 base64 JSON 响应 +7. 前端解码为 `Blob` 与 `AudioBuffer` +8. 
编辑器把音频加入媒体库并插入音轨 + +## 错误处理 + +- 请求参数非法:返回 `400` +- TTS 环境变量缺失:返回 `500`,信息明确为未配置 +- 外部 TTS 返回非 2xx:返回 `502`,透出可读错误 +- 外部 TTS 返回空音频或异常格式:返回 `502` +- 未知异常:返回 `500` + +## 测试策略 + +### 自动化测试 + +- 为适配层写纯函数测试,覆盖: + - `default` voice 映射 + - 请求 URL、headers、body 是否正确 + - 外部错误 JSON / 文本响应映射 + - 成功时返回音频数据 + +### 真实验证 + +- 使用环境中的真实 `API_BASE_URL`、`API_MODEL`、`API_KEY` +- 直接运行一次服务端适配逻辑,验证能拿到非空 MP3 数据 + +## 风险与缓解 + +- 外部服务不完全兼容 OpenAI TTS 协议 + - 缓解:把适配逻辑集中在单模块,后续改协议只动一处 +- 默认 voice 与实际模型不匹配 + - 缓解:统一在适配层做默认映射,避免前端散落判断 diff --git a/docs/plans/2026-03-17-tts-external-provider.md b/docs/plans/2026-03-17-tts-external-provider.md new file mode 100644 index 0000000..6c81b2d --- /dev/null +++ b/docs/plans/2026-03-17-tts-external-provider.md @@ -0,0 +1,147 @@ +# 外部 TTS 扩展 Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** 让 Cutia 的 TTS 能力从硬编码单一路由改为可配置的外部 TTS API 调用,并继续把生成语音接入媒体库和时间线。 + +**Architecture:** 在 `apps/web/src/lib/tts/` 新增可测试的 OpenAI 兼容 TTS 适配层,`/api/tts/generate` 只负责校验和响应转换,前端调用协议保持 `{ audio }` 不变。通过 `packages/env` 暴露配置,避免把供应商细节散落到 UI 和编辑器逻辑里。 + +**Tech Stack:** Next.js route handlers, TypeScript, Zod, Bun test, OpenAI-compatible HTTP API + +--- + +### Task 1: 补环境与 voice 常量基线 + +**Files:** +- Modify: `packages/env/src/web.ts` +- Modify: `apps/web/src/constants/tts-constants.ts` + +**Step 1: 写出目标测试用例草案** + +- 目标行为: + - TTS 配置可从环境读取 + - `default` voice 会映射到可用的默认外部 voice + +**Step 2: 运行当前目标测试确认缺失** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: FAIL,原因是测试文件或实现不存在。 + +**Step 3: 为后续实现准备最小配置面** + +- 在环境 schema 中加入 `API_BASE_URL`、`API_MODEL`、`API_KEY` +- 在 TTS 常量中定义默认 voice 与可选 voice 列表 + +**Step 4: 运行定向测试** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: 仍然失败,但失败点缩小到适配实现缺失。 + +**Step 5: Commit** + +```bash +git add packages/env/src/web.ts apps/web/src/constants/tts-constants.ts +git commit -m "feat: prepare external tts config" +``` + 
+### Task 2: 先写失败测试覆盖外部 TTS 适配层 + +**Files:** +- Create: `apps/web/src/lib/tts/openai-compatible.test.ts` +- Create: `apps/web/src/lib/tts/openai-compatible.ts` + +**Step 1: 写失败测试** + +- 成功场景:正确构造 `/audio/speech` 请求并返回音频 +- 失败场景:上游 JSON 错误、文本错误、空配置错误 +- voice 场景:`default` 被映射为默认 voice + +**Step 2: 运行测试验证失败** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: FAIL,且失败原因为导入缺失或行为不匹配,不是测试写错。 + +**Step 3: 写最小实现** + +- 提供配置解析 +- 提供请求构造 +- 提供错误解析 +- 提供音频数组缓冲区返回 + +**Step 4: 运行测试确认通过** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts +git commit -m "feat: add external tts adapter" +``` + +### Task 3: 接回 API 路由 + +**Files:** +- Modify: `apps/web/src/app/api/tts/generate/route.ts` + +**Step 1: 写失败测试预期** + +- 通过 Task 2 已确保适配层正确 +- 当前路由仍硬编码旧上游,因此与新适配层设计不一致 + +**Step 2: 运行现有测试基线** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 3: 最小改造路由** + +- 删除硬编码上游 URL 和旧返回结构解析 +- 保留 Zod 请求校验 +- 调用适配层并统一转换为 `{ audio }` + +**Step 4: 运行相关测试** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/web/src/app/api/tts/generate/route.ts +git commit -m "feat: wire route to external tts provider" +``` + +### Task 4: 端到端验证与整理 + +**Files:** +- Modify: `docs/plans/2026-03-17-tts-external-provider-design.md` +- Modify: `docs/plans/2026-03-17-tts-external-provider.md` + +**Step 1: 运行自动化测试** + +Run: `pnpm --filter @cutia/web test -- apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 2: 运行真实外部 TTS 验证** + +Run: `bun --eval '<补一段调用适配层的脚本>'` +Expected: 输出非空音频字节长度,不打印密钥。 + +**Step 3: 检查格式与类型** + +Run: `pnpm --filter @cutia/web lint` +Expected: PASS + +**Step 4: 整理工作台与提交内容** + +- 更新 Linear 工作台中的验收、验证和备注 +- 推送分支并创建 PR + +**Step 5: Commit** + +```bash +git add 
docs/plans/2026-03-17-tts-external-provider-design.md docs/plans/2026-03-17-tts-external-provider.md +git commit -m "docs: capture external tts plan" +``` diff --git a/packages/env/src/web.ts b/packages/env/src/web.ts index 4ec86fe..d0ffdbc 100644 --- a/packages/env/src/web.ts +++ b/packages/env/src/web.ts @@ -21,6 +21,9 @@ const webEnvSchema = z.object({ UPSTASH_REDIS_REST_TOKEN: z.string(), FREESOUND_CLIENT_ID: z.string().optional(), FREESOUND_API_KEY: z.string().optional(), + API_BASE_URL: z.string().optional(), + API_MODEL: z.string().optional(), + API_KEY: z.string().optional(), // Cloudflare R2 R2_ACCOUNT_ID: z.string().optional(), From 17979ae7c5786b0802d903958896de63e30c814b Mon Sep 17 00:00:00 2001 From: tianhei Date: Tue, 17 Mar 2026 19:29:54 +0800 Subject: [PATCH 02/22] fix(tts): address rework review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: - add timeout handling for external and legacy TTS fetches and expand OpenAI-compatible endpoint probing to cover both /v1 and root paths - harden legacy fallback with GET length limits, HTTPS/host validation, and audio content-type checks - rethrow missing external config instead of silently falling back, add regression tests, and fix the plan doc heading level Rationale: - review feedback identified real security and reliability gaps in the fallback provider and request orchestration - the legacy provider only supports GET, so length guards are the safe mitigation for query-string leakage and URL size limits - explicit tests are needed to lock the config-error path and timeout behavior before reopening review Tests: - bun test apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts 
apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/constants/tts-constants.ts packages/env/src/web.ts docs/plans/2026-03-17-tts-external-provider.md docs/plans/2026-03-17-tts-external-provider-design.md - bunx tsc -p apps/web/tsconfig.json --noEmit - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4100 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; const request = new NextRequest("http://localhost/api/tts/generate", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: "你好,Cutia。", voice: "default" }) }); const response = await POST(request); const json = await response.json(); console.log(JSON.stringify({ status: response.status, audioLength: json.audio?.length ?? 0, audioHead: json.audio?.slice(0, 8) ?? null, error: json.error ?? 
null }));' (workdir: apps/web) Co-authored-by: Codex --- apps/web/src/lib/tts/fetch-with-timeout.ts | 36 ++++++ apps/web/src/lib/tts/legacy.test.ts | 109 ++++++++++++++++++ apps/web/src/lib/tts/legacy.ts | 49 +++++++- .../web/src/lib/tts/openai-compatible.test.ts | 53 +++++++++ apps/web/src/lib/tts/openai-compatible.ts | 24 +++- apps/web/src/lib/tts/provider.test.ts | 24 ++++ apps/web/src/lib/tts/provider.ts | 9 +- .../plans/2026-03-17-tts-external-provider.md | 2 +- 8 files changed, 294 insertions(+), 12 deletions(-) create mode 100644 apps/web/src/lib/tts/fetch-with-timeout.ts create mode 100644 apps/web/src/lib/tts/legacy.test.ts diff --git a/apps/web/src/lib/tts/fetch-with-timeout.ts b/apps/web/src/lib/tts/fetch-with-timeout.ts new file mode 100644 index 0000000..49fbca7 --- /dev/null +++ b/apps/web/src/lib/tts/fetch-with-timeout.ts @@ -0,0 +1,36 @@ +type FetchLike = ( + input: RequestInfo | URL, + init?: RequestInit, +) => Promise; + +export async function fetchWithTimeout({ + fetchImpl, + input, + init, + timeoutMs, + timeoutMessage, +}: { + fetchImpl: FetchLike; + input: RequestInfo | URL; + init?: RequestInit; + timeoutMs: number; + timeoutMessage: string; +}): Promise { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + + try { + return await fetchImpl(input, { + ...init, + signal: controller.signal, + }); + } catch (error) { + if (controller.signal.aborted) { + throw new Error(timeoutMessage); + } + + throw error; + } finally { + clearTimeout(timeoutId); + } +} diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts new file mode 100644 index 0000000..f676ad2 --- /dev/null +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -0,0 +1,109 @@ +import { describe, expect, test } from "bun:test"; +import { synthesizeSpeechWithLegacyProvider } from "./legacy"; + +describe("synthesizeSpeechWithLegacyProvider", () => { + test("rejects audio urls outside the expected https host 
allowlist", async () => { + const calls: string[] = []; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + calls.push(String(input)); + return Response.json({ + code: 200, + url: "http://127.0.0.1/internal.mp3", + }); + }, + }), + ).rejects.toThrow("Legacy TTS returned an unexpected audio URL"); + + expect(calls).toHaveLength(1); + }); + + test("rejects non-audio content returned by the legacy audio download", async () => { + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + if (String(input).includes("/apis/mbAIsc?")) { + return Response.json({ + code: 200, + url: "https://api.milorapart.top/voice/test.mp3", + }); + } + + return new Response("", { + status: 200, + headers: { "Content-Type": "text/html; charset=utf-8" }, + }); + }, + }), + ).rejects.toThrow("Legacy TTS returned non-audio content"); + }); + + test("rejects synthesis text that would exceed the legacy GET limit", async () => { + let fetchCalled = false; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "中".repeat(400), + fetchImpl: async () => { + fetchCalled = true; + return Response.json({ + code: 200, + url: "https://api.milorapart.top/voice/test.mp3", + }); + }, + }), + ).rejects.toThrow("Legacy TTS text is too long for GET fallback"); + + expect(fetchCalled).toBe(false); + }); + + test("aborts the metadata request when the upstream hangs", async () => { + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + timeoutMs: 10, + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }), + }), + ).rejects.toThrow("Legacy TTS request timed out"); + }); + + test("aborts the audio download when the legacy audio fetch hangs", async () => { + let callCount = 0; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + 
timeoutMs: 10, + fetchImpl: async (_input, init) => { + callCount++; + + if (callCount === 1) { + return Response.json({ + code: 200, + url: "https://api.milorapart.top/voice/test.mp3", + }); + } + + return new Promise((_resolve, reject) => { + init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }); + }, + }), + ).rejects.toThrow("Legacy TTS audio download timed out"); + }); +}); diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index 15243f2..30e9e41 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -1,6 +1,10 @@ import { z } from "zod"; +import { fetchWithTimeout } from "./fetch-with-timeout"; const LEGACY_TTS_API_BASE = "https://api.milorapart.top/apis/mbAIsc"; +const LEGACY_TTS_ALLOWED_AUDIO_HOSTS = new Set(["api.milorapart.top"]); +const LEGACY_TTS_TIMEOUT_MS = 15_000; +const LEGACY_TTS_MAX_URL_LENGTH = 1_800; const legacyResponseSchema = z.object({ code: z.number(), @@ -15,16 +19,29 @@ type FetchLike = ( export async function synthesizeSpeechWithLegacyProvider({ text, fetchImpl = fetch, + timeoutMs = LEGACY_TTS_TIMEOUT_MS, }: { text: string; voice?: string; fetchImpl?: FetchLike; + timeoutMs?: number; }): Promise { - const upstreamUrl = `${LEGACY_TTS_API_BASE}?${new URLSearchParams({ + const query = new URLSearchParams({ format: "mp3", text, - })}`; - const upstreamResponse = await fetchImpl(upstreamUrl); + }).toString(); + const upstreamUrl = `${LEGACY_TTS_API_BASE}?${query}`; + + if (upstreamUrl.length > LEGACY_TTS_MAX_URL_LENGTH) { + throw new Error("Legacy TTS text is too long for GET fallback"); + } + + const upstreamResponse = await fetchWithTimeout({ + fetchImpl, + input: upstreamUrl, + timeoutMessage: "Legacy TTS request timed out", + timeoutMs, + }); if (!upstreamResponse.ok) { throw new Error(`Legacy TTS request failed: ${upstreamResponse.status}`); @@ -37,7 +54,21 @@ export async function synthesizeSpeechWithLegacyProvider({ throw new 
Error("Legacy TTS generation failed"); } - const audioResponse = await fetchImpl(parsed.data.url); + const audioUrl = new URL(parsed.data.url); + + if ( + audioUrl.protocol !== "https:" || + !LEGACY_TTS_ALLOWED_AUDIO_HOSTS.has(audioUrl.hostname) + ) { + throw new Error("Legacy TTS returned an unexpected audio URL"); + } + + const audioResponse = await fetchWithTimeout({ + fetchImpl, + input: audioUrl, + timeoutMessage: "Legacy TTS audio download timed out", + timeoutMs, + }); if (!audioResponse.ok) { throw new Error( @@ -45,6 +76,16 @@ export async function synthesizeSpeechWithLegacyProvider({ ); } + const contentType = audioResponse.headers.get("content-type") ?? ""; + + if ( + contentType && + !contentType.includes("audio/") && + contentType !== "application/octet-stream" + ) { + throw new Error(`Legacy TTS returned non-audio content: ${contentType}`); + } + const audio = await audioResponse.arrayBuffer(); if (audio.byteLength === 0) { diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 4769b48..86636a7 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -124,6 +124,36 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { ]); }); + test("tries the /v1 speech endpoint first when the base url is root-level", async () => { + const calls: string[] = []; + + const audio = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async (input) => { + const url = String(input); + calls.push(url); + + if (url === "https://example.com/v1/audio/speech") { + return new Response(Uint8Array.from([5, 4, 3]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + } + + return new Response("not found", { status: 404 }); + }, + }); + + expect(Array.from(new Uint8Array(audio))).toEqual([5, 4, 3]); + 
expect(calls[0]).toBe("https://example.com/v1/audio/speech"); + }); + test("rejects non-audio success responses", async () => { await expect( synthesizeSpeechWithOpenAiCompatible({ @@ -143,6 +173,29 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { ).rejects.toThrow("Expected audio response"); }); + test("aborts upstream requests that exceed the timeout", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + timeoutMs: 10, + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }), + }), + ).rejects.toThrow("External TTS request timed out"); + }); + test("surfaces upstream text errors when JSON is unavailable", async () => { await expect( synthesizeSpeechWithOpenAiCompatible({ diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 8396a1c..4f25e85 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -3,12 +3,14 @@ import { DEFAULT_EXTERNAL_TTS_VOICE, DEFAULT_VOICE_PACK, } from "@/constants/tts-constants"; +import { fetchWithTimeout } from "./fetch-with-timeout"; const externalTtsConfigSchema = z.object({ API_BASE_URL: z.string().min(1), API_MODEL: z.string().min(1), API_KEY: z.string().min(1), }); +const EXTERNAL_TTS_TIMEOUT_MS = 15_000; export { DEFAULT_EXTERNAL_TTS_VOICE }; @@ -92,11 +94,13 @@ function getSpeechEndpointUrls({ apiBaseUrl: string; }): string[] { const normalizedBaseUrl = apiBaseUrl.replace(/\/+$/, ""); - const urls = [`${normalizedBaseUrl}/audio/speech`]; - - if (normalizedBaseUrl.endsWith("/v1")) { - urls.push(`${normalizedBaseUrl.slice(0, -3)}/audio/speech`); - } + const baseWithoutV1 = normalizedBaseUrl.endsWith("/v1") + ? 
normalizedBaseUrl.slice(0, -3) + : normalizedBaseUrl; + const baseWithV1 = normalizedBaseUrl.endsWith("/v1") + ? normalizedBaseUrl + : `${normalizedBaseUrl}/v1`; + const urls = [`${baseWithV1}/audio/speech`, `${baseWithoutV1}/audio/speech`]; return [...new Set(urls)]; } @@ -106,11 +110,13 @@ export async function synthesizeSpeechWithOpenAiCompatible({ text, voice, fetchImpl = fetch, + timeoutMs = EXTERNAL_TTS_TIMEOUT_MS, }: { config: ExternalTtsConfig; text: string; voice?: string; fetchImpl?: FetchLike; + timeoutMs?: number; }): Promise { const endpointUrls = getSpeechEndpointUrls({ apiBaseUrl: config.apiBaseUrl, @@ -132,7 +138,13 @@ export async function synthesizeSpeechWithOpenAiCompatible({ let lastErrorResponse: Response | null = null; for (const endpointUrl of endpointUrls) { - const response = await fetchImpl(endpointUrl, requestInit); + const response = await fetchWithTimeout({ + fetchImpl, + init: requestInit, + input: endpointUrl, + timeoutMessage: "External TTS request timed out", + timeoutMs, + }); if (response.ok) { const contentType = response.headers.get("content-type") ?? 
""; diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index 5b6f6d7..561076d 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -50,4 +50,28 @@ describe("synthesizeSpeechWithFallback", () => { expect(Array.from(new Uint8Array(result))).toEqual([7, 8, 9]); expect(legacyCalled).toBe(true); }); + + test("rethrows missing external config instead of silently falling back", async () => { + let openAiCalled = false; + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: {}, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + openAiCalled = true; + return Uint8Array.from([1]).buffer; + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([9]).buffer; + }, + }), + ).rejects.toThrow("External TTS is not configured"); + + expect(openAiCalled).toBe(false); + expect(legacyCalled).toBe(false); + }); }); diff --git a/apps/web/src/lib/tts/provider.ts b/apps/web/src/lib/tts/provider.ts index 5b2c95a..9a0a02f 100644 --- a/apps/web/src/lib/tts/provider.ts +++ b/apps/web/src/lib/tts/provider.ts @@ -26,7 +26,14 @@ export async function synthesizeSpeechWithFallback({ try { const config = getExternalTtsConfig({ env }); return await openAiSynthesize({ config, text, voice }); - } catch { + } catch (error) { + if ( + error instanceof Error && + error.message === "External TTS is not configured" + ) { + throw error; + } + return legacySynthesize({ text, voice }); } } diff --git a/docs/plans/2026-03-17-tts-external-provider.md b/docs/plans/2026-03-17-tts-external-provider.md index 6c81b2d..a1ee5c9 100644 --- a/docs/plans/2026-03-17-tts-external-provider.md +++ b/docs/plans/2026-03-17-tts-external-provider.md @@ -10,7 +10,7 @@ --- -### Task 1: 补环境与 voice 常量基线 +## Task 1: 补环境与 voice 常量基线 **Files:** - Modify: `packages/env/src/web.ts` From e7c53476f2ce4d7b1f125f682ad4bbc34f6fd9d6 Mon Sep 17 00:00:00 2001 From: tianhei Date: 
Tue, 17 Mar 2026 23:22:04 +0800 Subject: [PATCH 03/22] fix(tts): harden provider response validation Summary: - reject legacy and OpenAI-compatible audio responses when the content-type header is missing instead of silently accepting them - preserve raw upstream error bodies when JSON payloads do not match the expected error schema and add regression tests for both cases - normalize the external TTS plan task headings to a consistent level Rationale: - CodeRabbit found two real validation gaps that allowed untyped success payloads through and one bug where JSON error parsing consumed the body before the text fallback could read it - locking these paths with targeted tests keeps the rework focused on the new review findings instead of broad refactoring - fixing the plan heading mismatch removes a repeated doc-only review item Tests: - bun test apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/constants/tts-constants.ts packages/env/src/web.ts docs/plans/2026-03-17-tts-external-provider.md docs/plans/2026-03-17-tts-external-provider-design.md - bunx tsc -p apps/web/tsconfig.json --noEmit - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4100 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; const request = new NextRequest("http://localhost/api/tts/generate", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: "hello", voice: "default" }) }); const response = 
await POST(request); const json = await response.json(); console.log(JSON.stringify({ status: response.status, audioLength: json.audio?.length ?? 0, audioHead: json.audio?.slice(0, 8) ?? null, error: json.error ?? null }));' (workdir: apps/web) Co-authored-by: Codex --- apps/web/src/lib/tts/legacy.test.ts | 20 ++++++++++ apps/web/src/lib/tts/legacy.ts | 5 ++- .../web/src/lib/tts/openai-compatible.test.ts | 37 +++++++++++++++++++ apps/web/src/lib/tts/openai-compatible.ts | 27 +++++++++----- .../plans/2026-03-17-tts-external-provider.md | 6 +-- 5 files changed, 80 insertions(+), 15 deletions(-) diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index f676ad2..396119b 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -42,6 +42,26 @@ describe("synthesizeSpeechWithLegacyProvider", () => { ).rejects.toThrow("Legacy TTS returned non-audio content"); }); + test("rejects audio downloads when the content-type header is missing", async () => { + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + if (String(input).includes("/apis/mbAIsc?")) { + return Response.json({ + code: 200, + url: "https://api.milorapart.top/voice/test.mp3", + }); + } + + return new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + }); + }, + }), + ).rejects.toThrow("Legacy TTS returned non-audio content"); + }); + test("rejects synthesis text that would exceed the legacy GET limit", async () => { let fetchCalled = false; diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index 30e9e41..189a528 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -79,11 +79,12 @@ export async function synthesizeSpeechWithLegacyProvider({ const contentType = audioResponse.headers.get("content-type") ?? 
""; if ( - contentType && !contentType.includes("audio/") && contentType !== "application/octet-stream" ) { - throw new Error(`Legacy TTS returned non-audio content: ${contentType}`); + throw new Error( + `Legacy TTS returned non-audio content: ${contentType || "(no content-type)"}`, + ); } const audio = await audioResponse.arrayBuffer(); diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 86636a7..68f0434 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -173,6 +173,24 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { ).rejects.toThrow("Expected audio response"); }); + test("rejects success responses when the content-type header is missing", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + }), + }), + ).rejects.toThrow("Expected audio response"); + }); + test("aborts upstream requests that exceed the timeout", async () => { await expect( synthesizeSpeechWithOpenAiCompatible({ @@ -214,4 +232,23 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { }), ).rejects.toThrow("gateway timeout"); }); + + test("falls back to the raw upstream body when JSON shape is unrecognized", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "nova", + fetchImpl: async () => + new Response('{"message":"bad request"}', { + status: 400, + headers: { "Content-Type": "application/json" }, + }), + }), + ).rejects.toThrow('{"message":"bad request"}'); + }); }); diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 
4f25e85..ab54d8e 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -57,15 +57,22 @@ async function getUpstreamErrorMessage({ response: Response; }): Promise { const contentType = response.headers.get("content-type") ?? ""; + const text = await response.text().catch(() => ""); if (contentType.includes("application/json")) { - const json = (await response.json().catch(() => null)) as { - error?: - | string - | { - message?: string; - }; - } | null; + const json = (() => { + try { + return JSON.parse(text) as { + error?: + | string + | { + message?: string; + }; + } | null; + } catch { + return null; + } + })(); if (typeof json?.error === "string" && json.error.trim()) { return json.error; @@ -80,7 +87,6 @@ async function getUpstreamErrorMessage({ } } - const text = await response.text().catch(() => ""); if (text.trim()) { return text; } @@ -150,11 +156,12 @@ export async function synthesizeSpeechWithOpenAiCompatible({ const contentType = response.headers.get("content-type") ?? 
""; if ( - contentType && !contentType.includes("audio/") && contentType !== "application/octet-stream" ) { - throw new Error(`Expected audio response, received ${contentType}`); + throw new Error( + `Expected audio response, received ${contentType || "(no content-type)"}`, + ); } const audio = await response.arrayBuffer(); diff --git a/docs/plans/2026-03-17-tts-external-provider.md b/docs/plans/2026-03-17-tts-external-provider.md index a1ee5c9..88c520b 100644 --- a/docs/plans/2026-03-17-tts-external-provider.md +++ b/docs/plans/2026-03-17-tts-external-provider.md @@ -44,7 +44,7 @@ git add packages/env/src/web.ts apps/web/src/constants/tts-constants.ts git commit -m "feat: prepare external tts config" ``` -### Task 2: 先写失败测试覆盖外部 TTS 适配层 +## Task 2: 先写失败测试覆盖外部 TTS 适配层 **Files:** - Create: `apps/web/src/lib/tts/openai-compatible.test.ts` @@ -80,7 +80,7 @@ git add apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-co git commit -m "feat: add external tts adapter" ``` -### Task 3: 接回 API 路由 +## Task 3: 接回 API 路由 **Files:** - Modify: `apps/web/src/app/api/tts/generate/route.ts` @@ -113,7 +113,7 @@ git add apps/web/src/app/api/tts/generate/route.ts git commit -m "feat: wire route to external tts provider" ``` -### Task 4: 端到端验证与整理 +## Task 4: 端到端验证与整理 **Files:** - Modify: `docs/plans/2026-03-17-tts-external-provider-design.md` From 7b0376f4fe7cbbc225b5a160b1e212ba1fef6cdb Mon Sep 17 00:00:00 2001 From: tianhei Date: Tue, 17 Mar 2026 23:36:47 +0800 Subject: [PATCH 04/22] fix(tts): preserve aborts and normalize config parsing Summary: - compose caller cancellation with timeout handling in fetchWithTimeout and cover immediate and in-flight aborts with tests - normalize legacy and OpenAI-compatible MIME checks so valid audio types with casing or parameters are accepted - trim external TTS config values before validation and remove the assistant-specific directive from the plan doc Rationale: - CodeRabbit identified a real correctness issue where caller 
aborts were overwritten by timeout logic inside the fetch wrapper - MIME checks should validate the media type, not the raw header string, or valid responses can be rejected unnecessarily - whitespace-only config values should fail fast instead of surfacing opaque upstream errors later in the request path Tests: - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/constants/tts-constants.ts packages/env/src/web.ts docs/plans/2026-03-17-tts-external-provider.md docs/plans/2026-03-17-tts-external-provider-design.md - bunx tsc -p apps/web/tsconfig.json --noEmit - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4100 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; const request = new NextRequest("http://localhost/api/tts/generate", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: "hello", voice: "default" }) }); const response = await POST(request); const json = await response.json(); console.log(JSON.stringify({ status: response.status, audioLength: json.audio?.length ?? 0, audioHead: json.audio?.slice(0, 8) ?? null, error: json.error ?? 
null }));' (workdir: apps/web) Co-authored-by: Codex --- .../src/lib/tts/fetch-with-timeout.test.ts | 51 +++++++++++++++++++ apps/web/src/lib/tts/fetch-with-timeout.ts | 30 ++++++++++- apps/web/src/lib/tts/legacy.test.ts | 21 ++++++++ apps/web/src/lib/tts/legacy.ts | 5 +- .../web/src/lib/tts/openai-compatible.test.ts | 31 +++++++++++ apps/web/src/lib/tts/openai-compatible.ts | 19 +++++-- .../plans/2026-03-17-tts-external-provider.md | 2 - 7 files changed, 148 insertions(+), 11 deletions(-) create mode 100644 apps/web/src/lib/tts/fetch-with-timeout.test.ts diff --git a/apps/web/src/lib/tts/fetch-with-timeout.test.ts b/apps/web/src/lib/tts/fetch-with-timeout.test.ts new file mode 100644 index 0000000..880fc93 --- /dev/null +++ b/apps/web/src/lib/tts/fetch-with-timeout.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, test } from "bun:test"; +import { fetchWithTimeout } from "./fetch-with-timeout"; + +describe("fetchWithTimeout", () => { + test("rejects immediately when the caller signal is already aborted", async () => { + const controller = new AbortController(); + const callerError = new Error("caller aborted"); + let fetchCalled = false; + + controller.abort(callerError); + + await expect( + fetchWithTimeout({ + fetchImpl: async () => { + fetchCalled = true; + return new Response("ok"); + }, + init: { signal: controller.signal }, + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 50, + }), + ).rejects.toThrow("caller aborted"); + + expect(fetchCalled).toBe(false); + }); + + test("surfaces caller cancellation for in-flight requests", async () => { + const controller = new AbortController(); + const callerError = new Error("caller aborted"); + + await expect( + fetchWithTimeout({ + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + setTimeout(() => controller.abort(callerError), 0); + + init?.signal?.addEventListener( + "abort", + () => reject(init.signal?.reason ?? 
new Error("aborted")), + { once: true }, + ); + }), + init: { signal: controller.signal }, + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 50, + }), + ).rejects.toThrow("caller aborted"); + }); +}); diff --git a/apps/web/src/lib/tts/fetch-with-timeout.ts b/apps/web/src/lib/tts/fetch-with-timeout.ts index 49fbca7..cc6960e 100644 --- a/apps/web/src/lib/tts/fetch-with-timeout.ts +++ b/apps/web/src/lib/tts/fetch-with-timeout.ts @@ -17,7 +17,24 @@ export async function fetchWithTimeout({ timeoutMessage: string; }): Promise { const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + const callerSignal = init?.signal; + let didTimeout = false; + const abortFromCaller = () => controller.abort(callerSignal?.reason); + + if (callerSignal?.aborted) { + if (callerSignal.reason instanceof Error) { + throw callerSignal.reason; + } + + throw new Error(String(callerSignal.reason ?? "Request aborted")); + } + + callerSignal?.addEventListener("abort", abortFromCaller, { once: true }); + + const timeoutId = setTimeout(() => { + didTimeout = true; + controller.abort(new Error(timeoutMessage)); + }, timeoutMs); try { return await fetchImpl(input, { @@ -25,12 +42,21 @@ export async function fetchWithTimeout({ signal: controller.signal, }); } catch (error) { - if (controller.signal.aborted) { + if (didTimeout) { throw new Error(timeoutMessage); } + if (callerSignal?.aborted) { + if (callerSignal.reason instanceof Error) { + throw callerSignal.reason; + } + + throw new Error(String(callerSignal.reason ?? 
"Request aborted")); + } + throw error; } finally { clearTimeout(timeoutId); + callerSignal?.removeEventListener("abort", abortFromCaller); } } diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index 396119b..6ece491 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -62,6 +62,27 @@ describe("synthesizeSpeechWithLegacyProvider", () => { ).rejects.toThrow("Legacy TTS returned non-audio content"); }); + test("accepts audio downloads when the MIME type casing and parameters vary", async () => { + const audio = await synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + if (String(input).includes("/apis/mbAIsc?")) { + return Response.json({ + code: 200, + url: "https://api.milorapart.top/voice/test.mp3", + }); + } + + return new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + headers: { "Content-Type": "Audio/MPEG; Charset=utf-8" }, + }); + }, + }); + + expect(Array.from(new Uint8Array(audio))).toEqual([1, 2, 3]); + }); + test("rejects synthesis text that would exceed the legacy GET limit", async () => { let fetchCalled = false; diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index 189a528..c796f45 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -77,10 +77,11 @@ export async function synthesizeSpeechWithLegacyProvider({ } const contentType = audioResponse.headers.get("content-type") ?? ""; + const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? 
""; if ( - !contentType.includes("audio/") && - contentType !== "application/octet-stream" + !mimeType.startsWith("audio/") && + mimeType !== "application/octet-stream" ) { throw new Error( `Legacy TTS returned non-audio content: ${contentType || "(no content-type)"}`, diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 68f0434..58883cc 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -32,6 +32,18 @@ describe("getExternalTtsConfig", () => { }), ).toThrow("External TTS is not configured"); }); + + test("rejects whitespace-only config values", () => { + expect(() => + getExternalTtsConfig({ + env: { + API_BASE_URL: " ", + API_MODEL: " ", + API_KEY: " ", + }, + }), + ).toThrow("External TTS is not configured"); + }); }); describe("synthesizeSpeechWithOpenAiCompatible", () => { @@ -191,6 +203,25 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { ).rejects.toThrow("Expected audio response"); }); + test("accepts audio responses when MIME type casing and parameters vary", async () => { + const audio = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + headers: { "Content-Type": "Audio/MPEG; Charset=utf-8" }, + }), + }); + + expect(Array.from(new Uint8Array(audio))).toEqual([1, 2, 3]); + }); + test("aborts upstream requests that exceed the timeout", async () => { await expect( synthesizeSpeechWithOpenAiCompatible({ diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index ab54d8e..e4821f7 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -36,10 +36,18 @@ export function getExternalTtsConfig({ throw new Error("External TTS is 
not configured"); } + const apiBaseUrl = parsed.data.API_BASE_URL.trim().replace(/\/+$/, ""); + const apiKey = parsed.data.API_KEY.trim(); + const model = parsed.data.API_MODEL.trim(); + + if (!apiBaseUrl || !apiKey || !model) { + throw new Error("External TTS is not configured"); + } + return { - apiBaseUrl: parsed.data.API_BASE_URL.replace(/\/+$/, ""), - apiKey: parsed.data.API_KEY, - model: parsed.data.API_MODEL, + apiBaseUrl, + apiKey, + model, }; } @@ -154,10 +162,11 @@ export async function synthesizeSpeechWithOpenAiCompatible({ if (response.ok) { const contentType = response.headers.get("content-type") ?? ""; + const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? ""; if ( - !contentType.includes("audio/") && - contentType !== "application/octet-stream" + !mimeType.startsWith("audio/") && + mimeType !== "application/octet-stream" ) { throw new Error( `Expected audio response, received ${contentType || "(no content-type)"}`, diff --git a/docs/plans/2026-03-17-tts-external-provider.md b/docs/plans/2026-03-17-tts-external-provider.md index 88c520b..34a07e7 100644 --- a/docs/plans/2026-03-17-tts-external-provider.md +++ b/docs/plans/2026-03-17-tts-external-provider.md @@ -1,7 +1,5 @@ # 外部 TTS 扩展 Implementation Plan -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. 
- **Goal:** 让 Cutia 的 TTS 能力从硬编码单一路由改为可配置的外部 TTS API 调用,并继续把生成语音接入媒体库和时间线。 **Architecture:** 在 `apps/web/src/lib/tts/` 新增可测试的 OpenAI 兼容 TTS 适配层,`/api/tts/generate` 只负责校验和响应转换,前端调用协议保持 `{ audio }` 不变。通过 `packages/env` 暴露配置,避免把供应商细节散落到 UI 和编辑器逻辑里。 From d7c896a98a65684bd9b63781b6b136e8e5ccd342 Mon Sep 17 00:00:00 2001 From: tianhei Date: Tue, 17 Mar 2026 23:43:37 +0800 Subject: [PATCH 05/22] refactor(tts): clean up provider helper types Summary: - export FetchLike from fetch-with-timeout and reuse it across the legacy and OpenAI-compatible adapters - extract the duplicated caller-abort error construction into a shared helper inside fetch-with-timeout - keep the legacy voice argument for adapter parity while making its intent explicit in code Rationale: - the remaining CodeRabbit comments were all maintenance-only cleanup items with no behavior change required - sharing the helper type and abort-reason logic reduces repetition without widening the TTS error-handling surface - documenting the unused legacy voice parameter makes the interface parity intentional instead of accidental Tests: - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts - bunx @biomejs/biome check apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/openai-compatible.ts docs/plans/2026-03-17-tts-external-provider.md - bunx tsc -p apps/web/tsconfig.json --noEmit Co-authored-by: Codex --- apps/web/src/lib/tts/fetch-with-timeout.ts | 22 +++++++++++----------- apps/web/src/lib/tts/legacy.ts | 10 ++++------ apps/web/src/lib/tts/openai-compatible.ts | 7 +------ 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/apps/web/src/lib/tts/fetch-with-timeout.ts b/apps/web/src/lib/tts/fetch-with-timeout.ts index cc6960e..47ab95b 100644 --- a/apps/web/src/lib/tts/fetch-with-timeout.ts +++ 
b/apps/web/src/lib/tts/fetch-with-timeout.ts @@ -1,8 +1,16 @@ -type FetchLike = ( +export type FetchLike = ( input: RequestInfo | URL, init?: RequestInit, ) => Promise; +function throwCallerAbortReason(signal: AbortSignal): never { + if (signal.reason instanceof Error) { + throw signal.reason; + } + + throw new Error(String(signal.reason ?? "Request aborted")); +} + export async function fetchWithTimeout({ fetchImpl, input, @@ -22,11 +30,7 @@ export async function fetchWithTimeout({ const abortFromCaller = () => controller.abort(callerSignal?.reason); if (callerSignal?.aborted) { - if (callerSignal.reason instanceof Error) { - throw callerSignal.reason; - } - - throw new Error(String(callerSignal.reason ?? "Request aborted")); + throwCallerAbortReason(callerSignal); } callerSignal?.addEventListener("abort", abortFromCaller, { once: true }); @@ -47,11 +51,7 @@ export async function fetchWithTimeout({ } if (callerSignal?.aborted) { - if (callerSignal.reason instanceof Error) { - throw callerSignal.reason; - } - - throw new Error(String(callerSignal.reason ?? 
"Request aborted")); + throwCallerAbortReason(callerSignal); } throw error; diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index c796f45..83b6d88 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { fetchWithTimeout } from "./fetch-with-timeout"; +import { fetchWithTimeout, type FetchLike } from "./fetch-with-timeout"; const LEGACY_TTS_API_BASE = "https://api.milorapart.top/apis/mbAIsc"; const LEGACY_TTS_ALLOWED_AUDIO_HOSTS = new Set(["api.milorapart.top"]); @@ -11,13 +11,9 @@ const legacyResponseSchema = z.object({ url: z.string().url(), }); -type FetchLike = ( - input: RequestInfo | URL, - init?: RequestInit, -) => Promise; - export async function synthesizeSpeechWithLegacyProvider({ text, + voice: _voice, fetchImpl = fetch, timeoutMs = LEGACY_TTS_TIMEOUT_MS, }: { @@ -26,6 +22,8 @@ export async function synthesizeSpeechWithLegacyProvider({ fetchImpl?: FetchLike; timeoutMs?: number; }): Promise { + void _voice; // Legacy upstream has a fixed voice; keep the arg for parity. 
+ const query = new URLSearchParams({ format: "mp3", text, diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index e4821f7..ccea62c 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -3,7 +3,7 @@ import { DEFAULT_EXTERNAL_TTS_VOICE, DEFAULT_VOICE_PACK, } from "@/constants/tts-constants"; -import { fetchWithTimeout } from "./fetch-with-timeout"; +import { fetchWithTimeout, type FetchLike } from "./fetch-with-timeout"; const externalTtsConfigSchema = z.object({ API_BASE_URL: z.string().min(1), @@ -20,11 +20,6 @@ export interface ExternalTtsConfig { model: string; } -type FetchLike = ( - input: RequestInfo | URL, - init?: RequestInit, -) => Promise; - export function getExternalTtsConfig({ env, }: { From d71ca3440d4cf3c9511de08c2613c61309f0558d Mon Sep 17 00:00:00 2001 From: tianhei Date: Thu, 19 Mar 2026 02:55:55 +0800 Subject: [PATCH 06/22] fix(tts): use structured provider error codes Summary: - add shared TTS error codes and typed TtsError helpers - emit structured config and upstream failures from TTS providers - map route responses by error code and add regression tests Rationale: - avoid coupling API status mapping to fragile message text - keep provider fallback behavior explicit while preserving readable errors - close the remaining PR review item with direct route-level coverage Tests: - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/app/api/tts/generate/route.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/errors.ts - bunx tsc -p apps/web/tsconfig.json --noEmit - POST 
/api/tts/generate probe: status=200 audioHead=SUQzBAAA Co-authored-by: Codex --- .../src/app/api/tts/generate/route.test.ts | 85 +++++++++++++++++++ apps/web/src/app/api/tts/generate/route.ts | 19 ++--- apps/web/src/lib/tts/errors.ts | 31 +++++++ apps/web/src/lib/tts/legacy.ts | 40 ++++++--- apps/web/src/lib/tts/openai-compatible.ts | 30 +++++-- apps/web/src/lib/tts/provider.test.ts | 13 +++ apps/web/src/lib/tts/provider.ts | 6 +- 7 files changed, 190 insertions(+), 34 deletions(-) create mode 100644 apps/web/src/app/api/tts/generate/route.test.ts create mode 100644 apps/web/src/lib/tts/errors.ts diff --git a/apps/web/src/app/api/tts/generate/route.test.ts b/apps/web/src/app/api/tts/generate/route.test.ts new file mode 100644 index 0000000..a2849f8 --- /dev/null +++ b/apps/web/src/app/api/tts/generate/route.test.ts @@ -0,0 +1,85 @@ +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; + +let synthesizeImpl: typeof import("@/lib/tts/provider").synthesizeSpeechWithFallback; +const originalConsoleError = console.error; + +mock.module("@cutia/env/web", () => ({ + webEnv: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, +})); + +mock.module("@/lib/tts/provider", () => ({ + synthesizeSpeechWithFallback: (args: Parameters[0]) => + synthesizeImpl(args), +})); + +const { POST } = await import("./route"); + +function createRequest(body: unknown): Request { + return new Request("http://localhost/api/tts/generate", { + body: JSON.stringify(body), + headers: { + "content-type": "application/json", + }, + method: "POST", + }); +} + +describe("POST /api/tts/generate", () => { + beforeEach(() => { + console.error = mock(() => {}); + synthesizeImpl = async () => Uint8Array.from([1, 2, 3]).buffer; + }); + + afterEach(() => { + console.error = originalConsoleError; + }); + + test("returns 502 for structured legacy upstream errors without relying on message prefixes", async () => { + synthesizeImpl = async () => { + 
throw Object.assign(new Error("legacy fallback audio download failed"), { + code: "LEGACY_TTS_UPSTREAM", + }); + }; + + const response = await POST(createRequest({ text: "hello" }) as never); + + expect(response.status).toBe(502); + expect(await response.json()).toEqual({ + error: "legacy fallback audio download failed", + }); + }); + + test("returns 502 for structured external upstream errors without relying on message prefixes", async () => { + synthesizeImpl = async () => { + throw Object.assign(new Error("upstream gateway timeout"), { + code: "EXTERNAL_TTS_UPSTREAM", + }); + }; + + const response = await POST(createRequest({ text: "hello" }) as never); + + expect(response.status).toBe(502); + expect(await response.json()).toEqual({ + error: "upstream gateway timeout", + }); + }); + + test("returns the original config error message for structured config failures", async () => { + synthesizeImpl = async () => { + throw Object.assign(new Error("external config missing"), { + code: "EXTERNAL_TTS_CONFIG", + }); + }; + + const response = await POST(createRequest({ text: "hello" }) as never); + + expect(response.status).toBe(500); + expect(await response.json()).toEqual({ + error: "external config missing", + }); + }); +}); diff --git a/apps/web/src/app/api/tts/generate/route.ts b/apps/web/src/app/api/tts/generate/route.ts index 9155317..5e70366 100644 --- a/apps/web/src/app/api/tts/generate/route.ts +++ b/apps/web/src/app/api/tts/generate/route.ts @@ -1,6 +1,7 @@ import { webEnv } from "@cutia/env/web"; import { type NextRequest, NextResponse } from "next/server"; import { z } from "zod"; +import { isTtsError } from "@/lib/tts/errors"; import { synthesizeSpeechWithFallback } from "@/lib/tts/provider"; const requestSchema = z.object({ @@ -36,16 +37,14 @@ export async function POST(request: NextRequest) { const message = error instanceof Error ? 
error.message : "Unknown error"; console.error("TTS generate error:", error); - if (message === "External TTS is not configured") { - return NextResponse.json({ error: message }, { status: 500 }); - } - - if ( - message.startsWith("External TTS request failed:") || - message === "External TTS returned empty audio" || - message.startsWith("Legacy TTS ") - ) { - return NextResponse.json({ error: message }, { status: 502 }); + if (isTtsError(error)) { + switch (error.code) { + case "EXTERNAL_TTS_CONFIG": + return NextResponse.json({ error: message }, { status: 500 }); + case "EXTERNAL_TTS_UPSTREAM": + case "LEGACY_TTS_UPSTREAM": + return NextResponse.json({ error: message }, { status: 502 }); + } } return NextResponse.json( diff --git a/apps/web/src/lib/tts/errors.ts b/apps/web/src/lib/tts/errors.ts new file mode 100644 index 0000000..d3f7bc4 --- /dev/null +++ b/apps/web/src/lib/tts/errors.ts @@ -0,0 +1,31 @@ +export const TTS_ERROR_CODES = [ + "EXTERNAL_TTS_CONFIG", + "EXTERNAL_TTS_UPSTREAM", + "LEGACY_TTS_UPSTREAM", +] as const; + +export type TtsErrorCode = (typeof TTS_ERROR_CODES)[number]; + +export class TtsError extends Error { + code: TtsErrorCode; + + constructor({ + code, + message, + }: { + code: TtsErrorCode; + message: string; + }) { + super(message); + this.name = "TtsError"; + this.code = code; + } +} + +export function isTtsError(error: unknown): error is TtsError { + if (!(error instanceof Error)) { + return false; + } + + return TTS_ERROR_CODES.includes((error as TtsError).code); +} diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index 83b6d88..72bd5c2 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -1,5 +1,6 @@ import { z } from "zod"; import { fetchWithTimeout, type FetchLike } from "./fetch-with-timeout"; +import { TtsError } from "./errors"; const LEGACY_TTS_API_BASE = "https://api.milorapart.top/apis/mbAIsc"; const LEGACY_TTS_ALLOWED_AUDIO_HOSTS = new Set(["api.milorapart.top"]); @@ 
-31,7 +32,10 @@ export async function synthesizeSpeechWithLegacyProvider({ const upstreamUrl = `${LEGACY_TTS_API_BASE}?${query}`; if (upstreamUrl.length > LEGACY_TTS_MAX_URL_LENGTH) { - throw new Error("Legacy TTS text is too long for GET fallback"); + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS text is too long for GET fallback", + }); } const upstreamResponse = await fetchWithTimeout({ @@ -42,14 +46,20 @@ export async function synthesizeSpeechWithLegacyProvider({ }); if (!upstreamResponse.ok) { - throw new Error(`Legacy TTS request failed: ${upstreamResponse.status}`); + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS request failed: ${upstreamResponse.status}`, + }); } const upstreamJson = await upstreamResponse.json().catch(() => null); const parsed = legacyResponseSchema.safeParse(upstreamJson); if (!parsed.success || parsed.data.code !== 200) { - throw new Error("Legacy TTS generation failed"); + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS generation failed", + }); } const audioUrl = new URL(parsed.data.url); @@ -58,7 +68,10 @@ export async function synthesizeSpeechWithLegacyProvider({ audioUrl.protocol !== "https:" || !LEGACY_TTS_ALLOWED_AUDIO_HOSTS.has(audioUrl.hostname) ) { - throw new Error("Legacy TTS returned an unexpected audio URL"); + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS returned an unexpected audio URL", + }); } const audioResponse = await fetchWithTimeout({ @@ -69,9 +82,10 @@ export async function synthesizeSpeechWithLegacyProvider({ }); if (!audioResponse.ok) { - throw new Error( - `Legacy TTS audio download failed: ${audioResponse.status}`, - ); + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS audio download failed: ${audioResponse.status}`, + }); } const contentType = audioResponse.headers.get("content-type") ?? 
""; @@ -81,15 +95,19 @@ export async function synthesizeSpeechWithLegacyProvider({ !mimeType.startsWith("audio/") && mimeType !== "application/octet-stream" ) { - throw new Error( - `Legacy TTS returned non-audio content: ${contentType || "(no content-type)"}`, - ); + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS returned non-audio content: ${contentType || "(no content-type)"}`, + }); } const audio = await audioResponse.arrayBuffer(); if (audio.byteLength === 0) { - throw new Error("Legacy TTS returned empty audio"); + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS returned empty audio", + }); } return audio; diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index ccea62c..2cd639b 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -4,6 +4,7 @@ import { DEFAULT_VOICE_PACK, } from "@/constants/tts-constants"; import { fetchWithTimeout, type FetchLike } from "./fetch-with-timeout"; +import { TtsError } from "./errors"; const externalTtsConfigSchema = z.object({ API_BASE_URL: z.string().min(1), @@ -28,7 +29,10 @@ export function getExternalTtsConfig({ const parsed = externalTtsConfigSchema.safeParse(env); if (!parsed.success) { - throw new Error("External TTS is not configured"); + throw new TtsError({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); } const apiBaseUrl = parsed.data.API_BASE_URL.trim().replace(/\/+$/, ""); @@ -36,7 +40,10 @@ export function getExternalTtsConfig({ const model = parsed.data.API_MODEL.trim(); if (!apiBaseUrl || !apiKey || !model) { - throw new Error("External TTS is not configured"); + throw new TtsError({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); } return { @@ -163,15 +170,19 @@ export async function synthesizeSpeechWithOpenAiCompatible({ !mimeType.startsWith("audio/") && mimeType !== 
"application/octet-stream" ) { - throw new Error( - `Expected audio response, received ${contentType || "(no content-type)"}`, - ); + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `Expected audio response, received ${contentType || "(no content-type)"}`, + }); } const audio = await response.arrayBuffer(); if (audio.byteLength === 0) { - throw new Error("External TTS returned empty audio"); + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS returned empty audio", + }); } return audio; @@ -184,9 +195,10 @@ export async function synthesizeSpeechWithOpenAiCompatible({ } } - throw new Error( - `External TTS request failed: ${await getUpstreamErrorMessage({ + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `External TTS request failed: ${await getUpstreamErrorMessage({ response: lastErrorResponse ?? new Response(null, { status: 500 }), })}`, - ); + }); } diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index 561076d..06dd5a1 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -74,4 +74,17 @@ describe("synthesizeSpeechWithFallback", () => { expect(openAiCalled).toBe(false); expect(legacyCalled).toBe(false); }); + + test("rethrows missing external config with a structured error code", async () => { + await expect( + synthesizeSpeechWithFallback({ + env: {}, + text: "hello", + voice: "default", + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); + }); }); diff --git a/apps/web/src/lib/tts/provider.ts b/apps/web/src/lib/tts/provider.ts index 9a0a02f..39f0669 100644 --- a/apps/web/src/lib/tts/provider.ts +++ b/apps/web/src/lib/tts/provider.ts @@ -2,6 +2,7 @@ import { getExternalTtsConfig, synthesizeSpeechWithOpenAiCompatible, } from "./openai-compatible"; +import { isTtsError } from "./errors"; import { synthesizeSpeechWithLegacyProvider } from "./legacy"; type 
TtsEnv = { @@ -27,10 +28,7 @@ export async function synthesizeSpeechWithFallback({ const config = getExternalTtsConfig({ env }); return await openAiSynthesize({ config, text, voice }); } catch (error) { - if ( - error instanceof Error && - error.message === "External TTS is not configured" - ) { + if (isTtsError(error) && error.code === "EXTERNAL_TTS_CONFIG") { throw error; } From 44fdfa5d1e9e013ac545473445b2caef43777399 Mon Sep 17 00:00:00 2001 From: tianhei Date: Thu, 19 Mar 2026 03:16:43 +0800 Subject: [PATCH 07/22] fix(tts): classify timeout failures as upstream errors Summary: - wrap external and legacy timeout failures in structured upstream TTS errors - add route success coverage and an exhaustive TTS error switch guard - tighten timeout regression tests to assert error codes instead of messages Rationale: - keep upstream timeouts on the 502 path instead of falling through to 500 - preserve readable timeout messages while restoring consistent route behavior - close the latest review round with direct regression coverage for the timeout path Tests: - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/app/api/tts/generate/route.test.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/errors.ts apps/web/src/constants/tts-constants.ts packages/env/src/web.ts docs/plans/2026-03-17-tts-external-provider.md docs/plans/2026-03-17-tts-external-provider-design.md - bunx tsc -p apps/web/tsconfig.json --noEmit - 
NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4311 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; /* status=200 audioHead=SUQzBAAA */' Co-authored-by: Codex --- .../src/app/api/tts/generate/route.test.ts | 9 ++++ apps/web/src/app/api/tts/generate/route.ts | 4 ++ apps/web/src/lib/tts/legacy.test.ts | 10 +++- apps/web/src/lib/tts/legacy.ts | 48 ++++++++++++++----- .../web/src/lib/tts/openai-compatible.test.ts | 5 +- apps/web/src/lib/tts/openai-compatible.ts | 32 ++++++++++--- 6 files changed, 86 insertions(+), 22 deletions(-) diff --git a/apps/web/src/app/api/tts/generate/route.test.ts b/apps/web/src/app/api/tts/generate/route.test.ts index a2849f8..cdf43b8 100644 --- a/apps/web/src/app/api/tts/generate/route.test.ts +++ b/apps/web/src/app/api/tts/generate/route.test.ts @@ -38,6 +38,15 @@ describe("POST /api/tts/generate", () => { console.error = originalConsoleError; }); + test("returns base64 audio for successful synthesis", async () => { + const response = await POST(createRequest({ text: "hello" }) as never); + + expect(response.status).toBe(200); + expect(await response.json()).toEqual({ + audio: "AQID", + }); + }); + test("returns 502 for structured legacy upstream errors without relying on message prefixes", async () => { synthesizeImpl = async () => { throw Object.assign(new Error("legacy fallback audio download failed"), { diff --git a/apps/web/src/app/api/tts/generate/route.ts b/apps/web/src/app/api/tts/generate/route.ts index 5e70366..82ffde3 100644 --- a/apps/web/src/app/api/tts/generate/route.ts +++ b/apps/web/src/app/api/tts/generate/route.ts @@ -44,6 +44,10 @@ export async function POST(request: NextRequest) { case "EXTERNAL_TTS_UPSTREAM": case "LEGACY_TTS_UPSTREAM": return NextResponse.json({ error: message }, { status: 502 }); + default: { + const exhaustiveCode: never = error.code; + throw new 
Error(`Unhandled TTS error code: ${exhaustiveCode}`); + } } } diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index 6ece491..37e4afd 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -116,7 +116,10 @@ describe("synthesizeSpeechWithLegacyProvider", () => { ); }), }), - ).rejects.toThrow("Legacy TTS request timed out"); + ).rejects.toMatchObject({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS request timed out", + }); }); test("aborts the audio download when the legacy audio fetch hangs", async () => { @@ -145,6 +148,9 @@ describe("synthesizeSpeechWithLegacyProvider", () => { }); }, }), - ).rejects.toThrow("Legacy TTS audio download timed out"); + ).rejects.toMatchObject({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download timed out", + }); }); }); diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index 72bd5c2..3f575e9 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -12,6 +12,18 @@ const legacyResponseSchema = z.object({ url: z.string().url(), }); +function wrapLegacyUpstreamError({ error }: { error: unknown }): TtsError { + if (error instanceof TtsError) { + return error; + } + + return new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: + error instanceof Error ? 
error.message : "Legacy TTS generation failed", + }); +} + export async function synthesizeSpeechWithLegacyProvider({ text, voice: _voice, @@ -38,12 +50,18 @@ export async function synthesizeSpeechWithLegacyProvider({ }); } - const upstreamResponse = await fetchWithTimeout({ - fetchImpl, - input: upstreamUrl, - timeoutMessage: "Legacy TTS request timed out", - timeoutMs, - }); + let upstreamResponse: Response; + + try { + upstreamResponse = await fetchWithTimeout({ + fetchImpl, + input: upstreamUrl, + timeoutMessage: "Legacy TTS request timed out", + timeoutMs, + }); + } catch (error) { + throw wrapLegacyUpstreamError({ error }); + } if (!upstreamResponse.ok) { throw new TtsError({ @@ -74,12 +92,18 @@ export async function synthesizeSpeechWithLegacyProvider({ }); } - const audioResponse = await fetchWithTimeout({ - fetchImpl, - input: audioUrl, - timeoutMessage: "Legacy TTS audio download timed out", - timeoutMs, - }); + let audioResponse: Response; + + try { + audioResponse = await fetchWithTimeout({ + fetchImpl, + input: audioUrl, + timeoutMessage: "Legacy TTS audio download timed out", + timeoutMs, + }); + } catch (error) { + throw wrapLegacyUpstreamError({ error }); + } if (!audioResponse.ok) { throw new TtsError({ diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 58883cc..05872f7 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -242,7 +242,10 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { ); }), }), - ).rejects.toThrow("External TTS request timed out"); + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS request timed out", + }); }); test("surfaces upstream text errors when JSON is unavailable", async () => { diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 2cd639b..93ec77d 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts 
+++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -21,6 +21,18 @@ export interface ExternalTtsConfig { model: string; } +function wrapExternalUpstreamError({ error }: { error: unknown }): TtsError { + if (error instanceof TtsError) { + return error; + } + + return new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + error instanceof Error ? error.message : "External TTS request failed", + }); +} + export function getExternalTtsConfig({ env, }: { @@ -154,13 +166,19 @@ export async function synthesizeSpeechWithOpenAiCompatible({ let lastErrorResponse: Response | null = null; for (const endpointUrl of endpointUrls) { - const response = await fetchWithTimeout({ - fetchImpl, - init: requestInit, - input: endpointUrl, - timeoutMessage: "External TTS request timed out", - timeoutMs, - }); + let response: Response; + + try { + response = await fetchWithTimeout({ + fetchImpl, + init: requestInit, + input: endpointUrl, + timeoutMessage: "External TTS request timed out", + timeoutMs, + }); + } catch (error) { + throw wrapExternalUpstreamError({ error }); + } if (response.ok) { const contentType = response.headers.get("content-type") ?? 
""; From eaa3798f55fc01a2d65242b0ce154b7d837e33da Mon Sep 17 00:00:00 2001 From: tianhei Date: Thu, 19 Mar 2026 03:47:37 +0800 Subject: [PATCH 08/22] fix(tts): narrow fallback and refresh design context Summary: - limit legacy fallback to structured EXTERNAL_TTS_UPSTREAM failures only - switch route tests to real TtsError instances and add unexpected-error coverage - remove orchestration-only design doc text and mark background as pre-change context Rationale: - surface unexpected external provider bugs instead of silently masking them - keep test fixtures aligned with production error handling paths - make the design doc describe architecture rather than orchestration metadata Tests: - bun test apps/web/src/lib/tts/provider.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/app/api/tts/generate/route.test.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/errors.ts apps/web/src/constants/tts-constants.ts packages/env/src/web.ts docs/plans/2026-03-17-tts-external-provider.md docs/plans/2026-03-17-tts-external-provider-design.md - bunx tsc -p apps/web/tsconfig.json --noEmit - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4311 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; /* status=200 audioHead=SUQzBAAA */' 
Co-authored-by: Codex --- .../src/app/api/tts/generate/route.test.ts | 10 ++++-- apps/web/src/lib/tts/provider.test.ts | 36 ++++++++++++++++--- apps/web/src/lib/tts/provider.ts | 4 +++ ...2026-03-17-tts-external-provider-design.md | 8 ++--- 4 files changed, 46 insertions(+), 12 deletions(-) diff --git a/apps/web/src/app/api/tts/generate/route.test.ts b/apps/web/src/app/api/tts/generate/route.test.ts index cdf43b8..27987a8 100644 --- a/apps/web/src/app/api/tts/generate/route.test.ts +++ b/apps/web/src/app/api/tts/generate/route.test.ts @@ -1,4 +1,5 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import { TtsError } from "@/lib/tts/errors"; let synthesizeImpl: typeof import("@/lib/tts/provider").synthesizeSpeechWithFallback; const originalConsoleError = console.error; @@ -49,8 +50,9 @@ describe("POST /api/tts/generate", () => { test("returns 502 for structured legacy upstream errors without relying on message prefixes", async () => { synthesizeImpl = async () => { - throw Object.assign(new Error("legacy fallback audio download failed"), { + throw new TtsError({ code: "LEGACY_TTS_UPSTREAM", + message: "legacy fallback audio download failed", }); }; @@ -64,8 +66,9 @@ describe("POST /api/tts/generate", () => { test("returns 502 for structured external upstream errors without relying on message prefixes", async () => { synthesizeImpl = async () => { - throw Object.assign(new Error("upstream gateway timeout"), { + throw new TtsError({ code: "EXTERNAL_TTS_UPSTREAM", + message: "upstream gateway timeout", }); }; @@ -79,8 +82,9 @@ describe("POST /api/tts/generate", () => { test("returns the original config error message for structured config failures", async () => { synthesizeImpl = async () => { - throw Object.assign(new Error("external config missing"), { + throw new TtsError({ code: "EXTERNAL_TTS_CONFIG", + message: "external config missing", }); }; diff --git a/apps/web/src/lib/tts/provider.test.ts 
b/apps/web/src/lib/tts/provider.test.ts index 06dd5a1..8e77f06 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -1,4 +1,5 @@ import { describe, expect, test } from "bun:test"; +import { TtsError } from "./errors"; import { synthesizeSpeechWithFallback } from "./provider"; describe("synthesizeSpeechWithFallback", () => { @@ -24,7 +25,7 @@ describe("synthesizeSpeechWithFallback", () => { expect(legacyCalled).toBe(false); }); - test("falls back to the legacy provider when the configured provider is unsupported", async () => { + test("falls back to the legacy provider for structured external upstream errors", async () => { let legacyCalled = false; const result = await synthesizeSpeechWithFallback({ @@ -36,9 +37,11 @@ describe("synthesizeSpeechWithFallback", () => { text: "hello", voice: "default", openAiSynthesize: async () => { - throw new Error( - "External TTS request failed: Expected audio response, received text/html; charset=utf-8", - ); + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + "External TTS request failed: Expected audio response, received text/html; charset=utf-8", + }); }, legacySynthesize: async ({ text }) => { legacyCalled = true; @@ -51,6 +54,31 @@ describe("synthesizeSpeechWithFallback", () => { expect(legacyCalled).toBe(true); }); + test("rethrows unexpected external provider errors instead of silently falling back", async () => { + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new Error("unexpected provider failure"); + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }), + ).rejects.toThrow("unexpected provider failure"); + + expect(legacyCalled).toBe(false); + }); + test("rethrows missing external config 
instead of silently falling back", async () => { let openAiCalled = false; let legacyCalled = false; diff --git a/apps/web/src/lib/tts/provider.ts b/apps/web/src/lib/tts/provider.ts index 39f0669..7165ce8 100644 --- a/apps/web/src/lib/tts/provider.ts +++ b/apps/web/src/lib/tts/provider.ts @@ -32,6 +32,10 @@ export async function synthesizeSpeechWithFallback({ throw error; } + if (!isTtsError(error) || error.code !== "EXTERNAL_TTS_UPSTREAM") { + throw error; + } + return legacySynthesize({ text, voice }); } } diff --git a/docs/plans/2026-03-17-tts-external-provider-design.md b/docs/plans/2026-03-17-tts-external-provider-design.md index 9638aec..6468645 100644 --- a/docs/plans/2026-03-17-tts-external-provider-design.md +++ b/docs/plans/2026-03-17-tts-external-provider-design.md @@ -4,15 +4,13 @@ `TIA-51` 要求 Cutia 支持调用外部 TTS API,把文本或对话内容生成语音并接入视频编辑流程。 -当前仓库已经有一条从文本元素生成语音并插入时间线的链路,但服务端路由 `apps/web/src/app/api/tts/generate/route.ts` 仍然把上游 TTS 服务硬编码为单一匿名接口,`voice` 参数也没有被真正消费。这意味着: +改造前,仓库已经有一条从文本元素生成语音并插入时间线的链路,但服务端路由 `apps/web/src/app/api/tts/generate/route.ts` 在上游扩展能力和 `voice` 语义上都存在限制。这意味着: - 外部 TTS 提供方无法通过环境配置切换 -- 语音选项只是前端占位,实际不会影响生成结果 +- `voice` 参数虽然已经沿链路透传,但缺少与外部 provider 对齐的清晰适配语义 - 错误语义受限于硬编码上游,缺少可维护的适配层 -## 无人值守前提 - -本次执行为无人值守编排,会直接根据工单描述和运行环境中的 `API_BASE_URL`、`API_MODEL`、`API_KEY` 做保守设计,不额外等待人工确认。 +后续设计会在保留既有编辑器接入方式的前提下,把 provider 配置、fallback 与错误适配层补齐为可维护结构。 ## 目标 From 60ccc1000a9c44c6ebc235791e53f4446312898d Mon Sep 17 00:00:00 2001 From: tianhei Date: Thu, 19 Mar 2026 23:26:29 +0800 Subject: [PATCH 09/22] fix(tts): close rework gaps in provider fallback Summary: - harden legacy audio downloads against unsafe redirects and add regression tests for redirect validation - preserve retryability metadata for external upstream failures so provider fallback skips non-retryable contract errors - expand timeout, provider, and route coverage and refresh the design note for the current fallback behavior Rationale: - CodeRabbit rework flagged redirect bypass and silent 
fallback cases that could hide the real external provider failure - the live probe showed a 200 text/html response was still falling through to legacy, so the external error needed to remain terminal - the extra tests keep the rework fixes pinned to the actual failure modes seen in review and runtime validation Tests: - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/app/api/tts/generate/route.test.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts docs/plans/2026-03-17-tts-external-provider-design.md - bunx tsc -p apps/web/tsconfig.json --noEmit - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4311 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval '...route probe...' 
Co-authored-by: Codex --- .../src/app/api/tts/generate/route.test.ts | 13 ++-- apps/web/src/lib/tts/errors.ts | 8 ++ .../src/lib/tts/fetch-with-timeout.test.ts | 36 +++++++++ apps/web/src/lib/tts/legacy.test.ts | 32 ++++++++ apps/web/src/lib/tts/legacy.ts | 37 +++++++++ .../web/src/lib/tts/openai-compatible.test.ts | 36 +++++++++ apps/web/src/lib/tts/openai-compatible.ts | 24 ++++++ apps/web/src/lib/tts/provider.test.ts | 76 +++++++++++++++++++ apps/web/src/lib/tts/provider.ts | 6 +- ...2026-03-17-tts-external-provider-design.md | 6 +- 10 files changed, 265 insertions(+), 9 deletions(-) diff --git a/apps/web/src/app/api/tts/generate/route.test.ts b/apps/web/src/app/api/tts/generate/route.test.ts index 27987a8..77485f1 100644 --- a/apps/web/src/app/api/tts/generate/route.test.ts +++ b/apps/web/src/app/api/tts/generate/route.test.ts @@ -1,5 +1,6 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; import { TtsError } from "@/lib/tts/errors"; +import { NextRequest } from "next/server"; let synthesizeImpl: typeof import("@/lib/tts/provider").synthesizeSpeechWithFallback; const originalConsoleError = console.error; @@ -19,8 +20,8 @@ mock.module("@/lib/tts/provider", () => ({ const { POST } = await import("./route"); -function createRequest(body: unknown): Request { - return new Request("http://localhost/api/tts/generate", { +function createRequest(body: unknown): NextRequest { + return new NextRequest("http://localhost/api/tts/generate", { body: JSON.stringify(body), headers: { "content-type": "application/json", @@ -40,7 +41,7 @@ describe("POST /api/tts/generate", () => { }); test("returns base64 audio for successful synthesis", async () => { - const response = await POST(createRequest({ text: "hello" }) as never); + const response = await POST(createRequest({ text: "hello" })); expect(response.status).toBe(200); expect(await response.json()).toEqual({ @@ -56,7 +57,7 @@ describe("POST /api/tts/generate", () => { }); }; - const response = 
await POST(createRequest({ text: "hello" }) as never); + const response = await POST(createRequest({ text: "hello" })); expect(response.status).toBe(502); expect(await response.json()).toEqual({ @@ -72,7 +73,7 @@ describe("POST /api/tts/generate", () => { }); }; - const response = await POST(createRequest({ text: "hello" }) as never); + const response = await POST(createRequest({ text: "hello" })); expect(response.status).toBe(502); expect(await response.json()).toEqual({ @@ -88,7 +89,7 @@ describe("POST /api/tts/generate", () => { }); }; - const response = await POST(createRequest({ text: "hello" }) as never); + const response = await POST(createRequest({ text: "hello" })); expect(response.status).toBe(500); expect(await response.json()).toEqual({ diff --git a/apps/web/src/lib/tts/errors.ts b/apps/web/src/lib/tts/errors.ts index d3f7bc4..639b9ee 100644 --- a/apps/web/src/lib/tts/errors.ts +++ b/apps/web/src/lib/tts/errors.ts @@ -8,17 +8,25 @@ export type TtsErrorCode = (typeof TTS_ERROR_CODES)[number]; export class TtsError extends Error { code: TtsErrorCode; + retryable?: boolean; + status?: number; constructor({ code, message, + retryable, + status, }: { code: TtsErrorCode; message: string; + retryable?: boolean; + status?: number; }) { super(message); this.name = "TtsError"; this.code = code; + this.retryable = retryable; + this.status = status; } } diff --git a/apps/web/src/lib/tts/fetch-with-timeout.test.ts b/apps/web/src/lib/tts/fetch-with-timeout.test.ts index 880fc93..8652a1f 100644 --- a/apps/web/src/lib/tts/fetch-with-timeout.test.ts +++ b/apps/web/src/lib/tts/fetch-with-timeout.test.ts @@ -2,6 +2,24 @@ import { describe, expect, test } from "bun:test"; import { fetchWithTimeout } from "./fetch-with-timeout"; describe("fetchWithTimeout", () => { + test("resolves successfully when fetch completes before the timeout", async () => { + let fetchCalled = false; + + const response = await fetchWithTimeout({ + fetchImpl: async () => { + fetchCalled = true; + 
return new Response("ok", { status: 200 }); + }, + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 50, + }); + + expect(fetchCalled).toBe(true); + expect(response.status).toBe(200); + expect(await response.text()).toBe("ok"); + }); + test("rejects immediately when the caller signal is already aborted", async () => { const controller = new AbortController(); const callerError = new Error("caller aborted"); @@ -48,4 +66,22 @@ describe("fetchWithTimeout", () => { }), ).rejects.toThrow("caller aborted"); }); + + test("rejects with the timeout message when fetch exceeds timeoutMs", async () => { + await expect( + fetchWithTimeout({ + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }), + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 10, + }), + ).rejects.toThrow("timed out"); + }); }); diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index 37e4afd..d78d637 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -83,6 +83,38 @@ describe("synthesizeSpeechWithLegacyProvider", () => { expect(Array.from(new Uint8Array(audio))).toEqual([1, 2, 3]); }); + test("rejects redirected audio downloads that leave the allowlist", async () => { + let sawManualRedirect = false; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input, init) => { + if (String(input).includes("/apis/mbAIsc?")) { + return Response.json({ + code: 200, + url: "https://api.milorapart.top/voice/test.mp3", + }); + } + + sawManualRedirect = init?.redirect === "manual"; + + return new Response(null, { + status: 302, + headers: { + location: "https://evil.example.com/payload.mp3", + }, + }); + }, + }), + ).rejects.toMatchObject({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download redirected to an 
unexpected host", + }); + + expect(sawManualRedirect).toBe(true); + }); + test("rejects synthesis text that would exceed the legacy GET limit", async () => { let fetchCalled = false; diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index 3f575e9..01d6092 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -12,6 +12,10 @@ const legacyResponseSchema = z.object({ url: z.string().url(), }); +function isRedirectStatus(status: number): boolean { + return status >= 300 && status < 400; +} + function wrapLegacyUpstreamError({ error }: { error: unknown }): TtsError { if (error instanceof TtsError) { return error; @@ -97,6 +101,7 @@ export async function synthesizeSpeechWithLegacyProvider({ try { audioResponse = await fetchWithTimeout({ fetchImpl, + init: { redirect: "manual" }, input: audioUrl, timeoutMessage: "Legacy TTS audio download timed out", timeoutMs, @@ -105,6 +110,38 @@ export async function synthesizeSpeechWithLegacyProvider({ throw wrapLegacyUpstreamError({ error }); } + if (isRedirectStatus(audioResponse.status)) { + const location = audioResponse.headers.get("location"); + + if (!location) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS audio download failed: ${audioResponse.status}`, + }); + } + + let redirectUrl: URL; + + try { + redirectUrl = new URL(location, audioUrl); + } catch { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download redirected to an invalid URL", + }); + } + + if ( + redirectUrl.protocol !== "https:" || + !LEGACY_TTS_ALLOWED_AUDIO_HOSTS.has(redirectUrl.hostname) + ) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download redirected to an unexpected host", + }); + } + } + if (!audioResponse.ok) { throw new TtsError({ code: "LEGACY_TTS_UPSTREAM", diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 05872f7..fdc90b2 
100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -44,6 +44,18 @@ describe("getExternalTtsConfig", () => { }), ).toThrow("External TTS is not configured"); }); + + test("rejects malformed API_BASE_URL values", () => { + expect(() => + getExternalTtsConfig({ + env: { + API_BASE_URL: "not-a-url", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + }), + ).toThrow("External TTS is not configured"); + }); }); describe("synthesizeSpeechWithOpenAiCompatible", () => { @@ -267,6 +279,30 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { ).rejects.toThrow("gateway timeout"); }); + test("marks auth failures as non-retryable upstream errors", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + Response.json( + { error: { message: "invalid api key" } }, + { status: 401, statusText: "Unauthorized" }, + ), + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS request failed: invalid api key", + retryable: false, + status: 401, + }); + }); + test("falls back to the raw upstream body when JSON shape is unrecognized", async () => { await expect( synthesizeSpeechWithOpenAiCompatible({ diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 93ec77d..b6f00cf 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -21,6 +21,14 @@ export interface ExternalTtsConfig { model: string; } +function isRetryableStatus(status: number | undefined): boolean { + if (status == null) { + return true; + } + + return status === 408 || status === 429 || status >= 500; +} + function wrapExternalUpstreamError({ error }: { error: unknown }): TtsError { if (error instanceof TtsError) { return error; @@ -30,6 +38,7 @@ function 
wrapExternalUpstreamError({ error }: { error: unknown }): TtsError { code: "EXTERNAL_TTS_UPSTREAM", message: error instanceof Error ? error.message : "External TTS request failed", + retryable: true, }); } @@ -58,6 +67,15 @@ export function getExternalTtsConfig({ }); } + try { + new URL(apiBaseUrl); + } catch { + throw new TtsError({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); + } + return { apiBaseUrl, apiKey, @@ -191,6 +209,8 @@ export async function synthesizeSpeechWithOpenAiCompatible({ throw new TtsError({ code: "EXTERNAL_TTS_UPSTREAM", message: `Expected audio response, received ${contentType || "(no content-type)"}`, + retryable: false, + status: response.status, }); } @@ -200,6 +220,8 @@ export async function synthesizeSpeechWithOpenAiCompatible({ throw new TtsError({ code: "EXTERNAL_TTS_UPSTREAM", message: "External TTS returned empty audio", + retryable: false, + status: response.status, }); } @@ -218,5 +240,7 @@ export async function synthesizeSpeechWithOpenAiCompatible({ message: `External TTS request failed: ${await getUpstreamErrorMessage({ response: lastErrorResponse ?? 
new Response(null, { status: 500 }), })}`, + retryable: isRetryableStatus(lastErrorResponse?.status), + status: lastErrorResponse?.status, }); } diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index 8e77f06..db2e0b7 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -1,5 +1,6 @@ import { describe, expect, test } from "bun:test"; import { TtsError } from "./errors"; +import { synthesizeSpeechWithOpenAiCompatible } from "./openai-compatible"; import { synthesizeSpeechWithFallback } from "./provider"; describe("synthesizeSpeechWithFallback", () => { @@ -79,6 +80,81 @@ describe("synthesizeSpeechWithFallback", () => { expect(legacyCalled).toBe(false); }); + test("rethrows non-retryable external upstream errors instead of falling back", async () => { + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw Object.assign( + new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS request failed: invalid api key", + }), + { + retryable: false, + status: 401, + }, + ); + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + retryable: false, + status: 401, + }); + + expect(legacyCalled).toBe(false); + }); + + test("does not fall back when the external provider returns a non-audio success response", async () => { + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: ({ config, text, voice }) => + synthesizeSpeechWithOpenAiCompatible({ + config, + text, + voice, + fetchImpl: 
async () => + new Response("", { + status: 200, + headers: { "Content-Type": "text/html; charset=utf-8" }, + }), + }), + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "Expected audio response, received text/html; charset=utf-8", + retryable: false, + }); + + expect(legacyCalled).toBe(false); + }); + test("rethrows missing external config instead of silently falling back", async () => { let openAiCalled = false; let legacyCalled = false; diff --git a/apps/web/src/lib/tts/provider.ts b/apps/web/src/lib/tts/provider.ts index 7165ce8..fd45996 100644 --- a/apps/web/src/lib/tts/provider.ts +++ b/apps/web/src/lib/tts/provider.ts @@ -32,7 +32,11 @@ export async function synthesizeSpeechWithFallback({ throw error; } - if (!isTtsError(error) || error.code !== "EXTERNAL_TTS_UPSTREAM") { + if ( + !isTtsError(error) || + error.code !== "EXTERNAL_TTS_UPSTREAM" || + error.retryable === false + ) { throw error; } diff --git a/docs/plans/2026-03-17-tts-external-provider-design.md b/docs/plans/2026-03-17-tts-external-provider-design.md index 6468645..3978d71 100644 --- a/docs/plans/2026-03-17-tts-external-provider-design.md +++ b/docs/plans/2026-03-17-tts-external-provider-design.md @@ -103,8 +103,10 @@ - 请求参数非法:返回 `400` - TTS 环境变量缺失:返回 `500`,信息明确为未配置 -- 外部 TTS 返回非 2xx:返回 `502`,透出可读错误 -- 外部 TTS 返回空音频或异常格式:返回 `502` +- 外部 provider 返回可重试的 `EXTERNAL_TTS_UPSTREAM`(例如超时、`429`、`5xx`)时,会先回退到 legacy provider + - legacy 回退成功:最终仍可能返回 `200` + - legacy 回退失败:最终返回 `502` +- 外部 provider 返回不可重试的 `EXTERNAL_TTS_UPSTREAM`(例如 `401`/`403`/`404`、空音频、非音频响应)时:直接返回 `502` - 未知异常:返回 `500` ## 测试策略 From b5dc0302132c52dc01722b65443b35b6a3655c0d Mon Sep 17 00:00:00 2001 From: tianhei Date: Thu, 19 Mar 2026 23:45:57 +0800 Subject: [PATCH 10/22] fix(tts): handle legacy redirects and namespaced config Summary: - follow allowlisted legacy audio redirects instead of failing after 
validating the target URL - prefer EXTERNAL_TTS_API_* config with API_* aliases kept for backward-compatible migration - update provider integration tests and TTS docs to reflect the new redirect and config behavior Rationale: - the latest PR review caught a real regression where safe legacy redirects could never succeed under redirect: manual - namespaced TTS env keys reduce collisions with unrelated API config while preserving the existing unattended environment - keeping docs and tests aligned prevents this review feedback from recurring in later rework attempts Tests: - bun test apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.test.ts - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/app/api/tts/generate/route.test.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts packages/env/src/web.ts docs/plans/2026-03-17-tts-external-provider.md docs/plans/2026-03-17-tts-external-provider-design.md - bunx tsc -p apps/web/tsconfig.json --noEmit Co-authored-by: Codex --- apps/web/src/lib/tts/legacy.test.ts | 40 +++++++++++++++++++ apps/web/src/lib/tts/legacy.ts | 12 ++++++ .../web/src/lib/tts/openai-compatible.test.ts | 37 ++++++++++++++++- apps/web/src/lib/tts/openai-compatible.ts | 16 +++++++- apps/web/src/lib/tts/provider.test.ts | 6 +-- apps/web/src/lib/tts/provider.ts | 3 ++ ...2026-03-17-tts-external-provider-design.md | 5 ++- .../plans/2026-03-17-tts-external-provider.md | 3 +- 
packages/env/src/web.ts | 3 ++ 9 files changed, 117 insertions(+), 8 deletions(-) diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index d78d637..bbe9896 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -115,6 +115,46 @@ describe("synthesizeSpeechWithLegacyProvider", () => { expect(sawManualRedirect).toBe(true); }); + test("follows allowlisted redirects for legacy audio downloads", async () => { + let downloadCallCount = 0; + + const audio = await synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input, init) => { + if (String(input).includes("/apis/mbAIsc?")) { + return Response.json({ + code: 200, + url: "https://api.milorapart.top/voice/test.mp3", + }); + } + + downloadCallCount++; + + if (downloadCallCount === 1) { + expect(init?.redirect).toBe("manual"); + + return new Response(null, { + status: 302, + headers: { + location: "https://api.milorapart.top/voice/test-redirected.mp3", + }, + }); + } + + expect(String(input)).toBe( + "https://api.milorapart.top/voice/test-redirected.mp3", + ); + return new Response(Uint8Array.from([4, 5, 6]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + }, + }); + + expect(downloadCallCount).toBe(2); + expect(Array.from(new Uint8Array(audio))).toEqual([4, 5, 6]); + }); + test("rejects synthesis text that would exceed the legacy GET limit", async () => { let fetchCalled = false; diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts index 01d6092..f0e8212 100644 --- a/apps/web/src/lib/tts/legacy.ts +++ b/apps/web/src/lib/tts/legacy.ts @@ -140,6 +140,18 @@ export async function synthesizeSpeechWithLegacyProvider({ message: "Legacy TTS audio download redirected to an unexpected host", }); } + + try { + audioResponse = await fetchWithTimeout({ + fetchImpl, + init: { redirect: "error" }, + input: redirectUrl, + timeoutMessage: "Legacy TTS audio download timed out", + timeoutMs, + }); + 
} catch (error) { + throw wrapLegacyUpstreamError({ error }); + } } if (!audioResponse.ok) { diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index fdc90b2..366065f 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -6,7 +6,23 @@ import { } from "./openai-compatible"; describe("getExternalTtsConfig", () => { - test("reads required config from environment", () => { + test("reads namespaced TTS config from environment", () => { + const config = getExternalTtsConfig({ + env: { + EXTERNAL_TTS_API_BASE_URL: "https://example.com/v1/", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", + }, + }); + + expect(config).toEqual({ + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }); + }); + + test("falls back to legacy API_* aliases when namespaced TTS config is absent", () => { const config = getExternalTtsConfig({ env: { API_BASE_URL: "https://example.com/v1/", @@ -22,6 +38,25 @@ describe("getExternalTtsConfig", () => { }); }); + test("prefers namespaced TTS config over legacy aliases", () => { + const config = getExternalTtsConfig({ + env: { + API_BASE_URL: "https://legacy.example.com/v1/", + API_MODEL: "legacy-tts", + API_KEY: "legacy-secret", + EXTERNAL_TTS_API_BASE_URL: "https://example.com/v1/", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", + }, + }); + + expect(config).toEqual({ + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }); + }); + test("throws a clear error when config is incomplete", () => { expect(() => getExternalTtsConfig({ diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index b6f00cf..8fb42d0 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -21,6 +21,18 @@ export interface ExternalTtsConfig { model: string; } +function 
resolveExternalTtsEnv({ + env, +}: { + env: Record; +}): Record<"API_BASE_URL" | "API_MODEL" | "API_KEY", string | undefined> { + return { + API_BASE_URL: env.EXTERNAL_TTS_API_BASE_URL ?? env.API_BASE_URL, + API_MODEL: env.EXTERNAL_TTS_API_MODEL ?? env.API_MODEL, + API_KEY: env.EXTERNAL_TTS_API_KEY ?? env.API_KEY, + }; +} + function isRetryableStatus(status: number | undefined): boolean { if (status == null) { return true; @@ -47,7 +59,9 @@ export function getExternalTtsConfig({ }: { env: Record; }): ExternalTtsConfig { - const parsed = externalTtsConfigSchema.safeParse(env); + const parsed = externalTtsConfigSchema.safeParse( + resolveExternalTtsEnv({ env }), + ); if (!parsed.success) { throw new TtsError({ diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index db2e0b7..627f9d1 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -9,9 +9,9 @@ describe("synthesizeSpeechWithFallback", () => { const result = await synthesizeSpeechWithFallback({ env: { - API_BASE_URL: "https://example.com/v1", - API_MODEL: "tts-1", - API_KEY: "secret", + EXTERNAL_TTS_API_BASE_URL: "https://example.com/v1", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", }, text: "hello", voice: "default", diff --git a/apps/web/src/lib/tts/provider.ts b/apps/web/src/lib/tts/provider.ts index fd45996..39d1c37 100644 --- a/apps/web/src/lib/tts/provider.ts +++ b/apps/web/src/lib/tts/provider.ts @@ -9,6 +9,9 @@ type TtsEnv = { API_BASE_URL?: string; API_MODEL?: string; API_KEY?: string; + EXTERNAL_TTS_API_BASE_URL?: string; + EXTERNAL_TTS_API_MODEL?: string; + EXTERNAL_TTS_API_KEY?: string; }; export async function synthesizeSpeechWithFallback({ diff --git a/docs/plans/2026-03-17-tts-external-provider-design.md b/docs/plans/2026-03-17-tts-external-provider-design.md index 3978d71..cadce47 100644 --- a/docs/plans/2026-03-17-tts-external-provider-design.md +++ 
b/docs/plans/2026-03-17-tts-external-provider-design.md @@ -43,7 +43,8 @@ 优点: - 只需要一层薄适配,即可支持大量 OpenAI 兼容的 TTS 服务 -- 和当前运行环境提供的 `API_BASE_URL`、`API_MODEL`、`API_KEY` 直接对齐 +- 优先使用 `EXTERNAL_TTS_API_BASE_URL`、`EXTERNAL_TTS_API_MODEL`、`EXTERNAL_TTS_API_KEY` +- 兼容读取当前运行环境里的 `API_BASE_URL`、`API_MODEL`、`API_KEY` 作为迁移别名 - 前端接口保持不变,编辑器链路改动最小 缺点: @@ -121,7 +122,7 @@ ### 真实验证 -- 使用环境中的真实 `API_BASE_URL`、`API_MODEL`、`API_KEY` +- 使用环境中的真实 TTS 配置验证,优先为 `EXTERNAL_TTS_API_*`,没有时回退到 `API_*` - 直接运行一次服务端适配逻辑,验证能拿到非空 MP3 数据 ## 风险与缓解 diff --git a/docs/plans/2026-03-17-tts-external-provider.md b/docs/plans/2026-03-17-tts-external-provider.md index 34a07e7..c05f40d 100644 --- a/docs/plans/2026-03-17-tts-external-provider.md +++ b/docs/plans/2026-03-17-tts-external-provider.md @@ -27,7 +27,8 @@ Expected: FAIL,原因是测试文件或实现不存在。 **Step 3: 为后续实现准备最小配置面** -- 在环境 schema 中加入 `API_BASE_URL`、`API_MODEL`、`API_KEY` +- 在环境 schema 中加入 `EXTERNAL_TTS_API_BASE_URL`、`EXTERNAL_TTS_API_MODEL`、`EXTERNAL_TTS_API_KEY` +- 兼容读取旧的 `API_BASE_URL`、`API_MODEL`、`API_KEY`,用于迁移与当前共享环境 - 在 TTS 常量中定义默认 voice 与可选 voice 列表 **Step 4: 运行定向测试** diff --git a/packages/env/src/web.ts b/packages/env/src/web.ts index d0ffdbc..c40efc5 100644 --- a/packages/env/src/web.ts +++ b/packages/env/src/web.ts @@ -21,6 +21,9 @@ const webEnvSchema = z.object({ UPSTASH_REDIS_REST_TOKEN: z.string(), FREESOUND_CLIENT_ID: z.string().optional(), FREESOUND_API_KEY: z.string().optional(), + EXTERNAL_TTS_API_BASE_URL: z.string().optional(), + EXTERNAL_TTS_API_MODEL: z.string().optional(), + EXTERNAL_TTS_API_KEY: z.string().optional(), API_BASE_URL: z.string().optional(), API_MODEL: z.string().optional(), API_KEY: z.string().optional(), From 5e233ff1f8bfac5472371fea26b76fccb4a42830 Mon Sep 17 00:00:00 2001 From: tianhei Date: Thu, 19 Mar 2026 23:50:28 +0800 Subject: [PATCH 11/22] test(tts): cover fallback 500 path and clean fixtures Summary: - add a route test for unexpected non-TtsError failures so the generic 500 fallback path stays covered - simplify 
provider test fixtures by constructing TtsError directly instead of mutating the instance after creation Rationale: - the latest review only left test-layer gaps, but they still matter for future refactors around error handling - using the constructor directly keeps the fixture idiomatic and matches the runtime shape more clearly Tests: - bun test apps/web/src/lib/tts/provider.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx @biomejs/biome check apps/web/src/lib/tts/provider.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx tsc -p apps/web/tsconfig.json --noEmit Co-authored-by: Codex --- apps/web/src/app/api/tts/generate/route.test.ts | 14 ++++++++++++++ apps/web/src/lib/tts/provider.test.ts | 16 ++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/apps/web/src/app/api/tts/generate/route.test.ts b/apps/web/src/app/api/tts/generate/route.test.ts index 77485f1..6250a9f 100644 --- a/apps/web/src/app/api/tts/generate/route.test.ts +++ b/apps/web/src/app/api/tts/generate/route.test.ts @@ -96,4 +96,18 @@ describe("POST /api/tts/generate", () => { error: "external config missing", }); }); + + test("returns 500 for unexpected non-TtsError exceptions", async () => { + synthesizeImpl = async () => { + throw new Error("unexpected failure"); + }; + + const response = await POST(createRequest({ text: "hello" })); + + expect(response.status).toBe(500); + expect(await response.json()).toEqual({ + error: "Internal server error", + detail: "unexpected failure", + }); + }); }); diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index 627f9d1..abef510 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -93,16 +93,12 @@ 
describe("synthesizeSpeechWithFallback", () => { text: "hello", voice: "default", openAiSynthesize: async () => { - throw Object.assign( - new TtsError({ - code: "EXTERNAL_TTS_UPSTREAM", - message: "External TTS request failed: invalid api key", - }), - { - retryable: false, - status: 401, - }, - ); + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS request failed: invalid api key", + retryable: false, + status: 401, + }); }, legacySynthesize: async () => { legacyCalled = true; From 97faf992ae09d3fe5a61e71d448590916836d1e5 Mon Sep 17 00:00:00 2001 From: tianhei Date: Thu, 19 Mar 2026 23:59:08 +0800 Subject: [PATCH 12/22] fix(tts): harden upstream audio reads and probes Summary: - reject non-http external base URLs and wrap audio body read failures as structured non-retryable TtsError values - tighten the legacy redirect success test to assert the follow-up fetch uses redirect: error - replace the implementation-plan placeholder with a concrete bun probe for the external adapter entrypoint Rationale: - the latest review found two real gaps where malformed config or body stream failures could escape the expected TTS error semantics - the stricter redirect assertion pins the security-sensitive fetch mode after the validated redirect handoff - the doc probe now matches the shipped adapter and is reproducible Tests: - bun test apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts docs/plans/2026-03-17-tts-external-provider.md - bunx tsc -p apps/web/tsconfig.json --noEmit Co-authored-by: 
Codex --- apps/web/src/lib/tts/legacy.test.ts | 1 + .../web/src/lib/tts/openai-compatible.test.ts | 42 +++++++++++++++++++ apps/web/src/lib/tts/openai-compatible.ts | 19 ++++++++- .../plans/2026-03-17-tts-external-provider.md | 2 +- 4 files changed, 61 insertions(+), 3 deletions(-) diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index bbe9896..67620db 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -141,6 +141,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { }); } + expect(init?.redirect).toBe("error"); expect(String(input)).toBe( "https://api.milorapart.top/voice/test-redirected.mp3", ); diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 366065f..eaad7e8 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -91,6 +91,18 @@ describe("getExternalTtsConfig", () => { }), ).toThrow("External TTS is not configured"); }); + + test("rejects non-http API_BASE_URL schemes", () => { + expect(() => + getExternalTtsConfig({ + env: { + EXTERNAL_TTS_API_BASE_URL: "mailto:tts@example.com", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", + }, + }), + ).toThrow("External TTS is not configured"); + }); }); describe("synthesizeSpeechWithOpenAiCompatible", () => { @@ -269,6 +281,36 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { expect(Array.from(new Uint8Array(audio))).toEqual([1, 2, 3]); }); + test("wraps arrayBuffer read failures as non-retryable upstream errors", async () => { + const response = new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + Object.defineProperty(response, "arrayBuffer", { + value: async () => { + throw new Error("stream failed"); + }, + }); + + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", 
+ apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => response, + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS audio read failed: stream failed", + retryable: false, + status: 200, + }); + }); + test("aborts upstream requests that exceed the timeout", async () => { await expect( synthesizeSpeechWithOpenAiCompatible({ diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 8fb42d0..6f55c1d 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -82,7 +82,11 @@ export function getExternalTtsConfig({ } try { - new URL(apiBaseUrl); + const url = new URL(apiBaseUrl); + + if (url.protocol !== "http:" && url.protocol !== "https:") { + throw new Error("Unsupported protocol"); + } } catch { throw new TtsError({ code: "EXTERNAL_TTS_CONFIG", @@ -228,7 +232,18 @@ export async function synthesizeSpeechWithOpenAiCompatible({ }); } - const audio = await response.arrayBuffer(); + let audio: ArrayBuffer; + + try { + audio = await response.arrayBuffer(); + } catch (error) { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `External TTS audio read failed: ${error instanceof Error ? 
error.message : "Unknown error"}`, + retryable: false, + status: response.status, + }); + } if (audio.byteLength === 0) { throw new TtsError({ diff --git a/docs/plans/2026-03-17-tts-external-provider.md b/docs/plans/2026-03-17-tts-external-provider.md index c05f40d..8b32e0f 100644 --- a/docs/plans/2026-03-17-tts-external-provider.md +++ b/docs/plans/2026-03-17-tts-external-provider.md @@ -125,7 +125,7 @@ Expected: PASS **Step 2: 运行真实外部 TTS 验证** -Run: `bun --eval '<补一段调用适配层的脚本>'` +Run: `bun --eval 'import { getExternalTtsConfig, synthesizeSpeechWithOpenAiCompatible } from "./apps/web/src/lib/tts/openai-compatible.ts"; const config = getExternalTtsConfig({ env: process.env }); const audio = await synthesizeSpeechWithOpenAiCompatible({ config, text: "Cutia TTS probe", voice: "default" }); console.log(audio.byteLength);'` Expected: 输出非空音频字节长度,不打印密钥。 **Step 3: 检查格式与类型** From 7279ef157b7c20118ee8ac54f31ef810c189727d Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 00:07:30 +0800 Subject: [PATCH 13/22] test(tts): cover media and timeline insertion flow Summary: - add service-level tests for base64 decoding from `/api/tts/generate` - verify generated speech is uploaded into the media library and inserted onto an available audio track - verify overlapping inserts allocate a new audio track when needed Rationale: - the remaining acceptance surface was the editor-side integration from generated audio into Cutia's media and timeline workflow - these tests give deterministic evidence for the app integration even while the shared external provider still lacks real audio output Tests: - bun test apps/web/src/lib/tts/service.test.ts - bunx @biomejs/biome check apps/web/src/lib/tts/service.test.ts apps/web/src/lib/tts/service.ts - bunx tsc -p apps/web/tsconfig.json --noEmit Co-authored-by: Codex --- apps/web/src/lib/tts/service.test.ts | 220 +++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 apps/web/src/lib/tts/service.test.ts diff --git 
a/apps/web/src/lib/tts/service.test.ts b/apps/web/src/lib/tts/service.test.ts new file mode 100644 index 0000000..86624c2 --- /dev/null +++ b/apps/web/src/lib/tts/service.test.ts @@ -0,0 +1,220 @@ +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import type { EditorCore } from "@/core"; +import type { AudioTrack } from "@/types/timeline"; +import { generateAndInsertSpeech, generateSpeechFromText } from "./service"; + +const originalFetch = globalThis.fetch; +const originalAudioContext = globalThis.AudioContext; +const originalCreateObjectURL = URL.createObjectURL; + +describe("tts service", () => { + let decodedBytes: number[] | null; + let fakeBuffer: AudioBuffer; + + beforeEach(() => { + decodedBytes = null; + fakeBuffer = { duration: 2.5 } as AudioBuffer; + + Object.defineProperty(globalThis, "AudioContext", { + configurable: true, + value: class FakeAudioContext { + async decodeAudioData(arrayBuffer: ArrayBuffer) { + decodedBytes = Array.from(new Uint8Array(arrayBuffer)); + return fakeBuffer; + } + }, + }); + URL.createObjectURL = mock(() => "blob:tts-preview"); + }); + + afterEach(() => { + globalThis.fetch = originalFetch; + Object.defineProperty(globalThis, "AudioContext", { + configurable: true, + value: originalAudioContext, + }); + URL.createObjectURL = originalCreateObjectURL; + }); + + test("generateSpeechFromText decodes base64 audio returned by the route", async () => { + const fetchCalls: Array<[RequestInfo | URL, RequestInit | undefined]> = []; + globalThis.fetch = (async (input, init) => { + fetchCalls.push([input, init]); + return Response.json({ audio: "AQID" }); + }) as typeof fetch; + + const result = await generateSpeechFromText({ + text: "hello", + voice: "nova", + }); + + expect(fetchCalls).toHaveLength(1); + expect(fetchCalls[0]?.[0]).toBe("/api/tts/generate"); + expect(fetchCalls[0]?.[1]).toMatchObject({ + method: "POST", + headers: { "Content-Type": "application/json" }, + }); + 
expect(JSON.parse(String(fetchCalls[0]?.[1]?.body))).toEqual({ + text: "hello", + voice: "nova", + }); + expect(decodedBytes).toEqual([1, 2, 3]); + expect(result.duration).toBe(2.5); + expect(result.buffer).toBe(fakeBuffer); + expect(result.blob.type).toBe("audio/mpeg"); + expect(Array.from(new Uint8Array(await result.blob.arrayBuffer()))).toEqual( + [1, 2, 3], + ); + }); + + test("generateAndInsertSpeech uploads generated audio and inserts it into an existing audio track", async () => { + globalThis.fetch = (async () => + Response.json({ audio: "AQID" })) as unknown as typeof fetch; + + const tracks: AudioTrack[] = [ + { + id: "audio-track-1", + name: "Audio 1", + type: "audio", + muted: false, + elements: [], + }, + ]; + const addMediaAssetCalls: unknown[] = []; + const addMediaAssetMock = async (args: unknown) => { + addMediaAssetCalls.push(args); + return "media-1"; + }; + let addTrackCallCount = 0; + const addTrackMock = () => { + addTrackCallCount++; + throw new Error("addTrack should not be called"); + }; + const insertElementCalls: unknown[] = []; + const insertElementMock = (args: unknown) => { + insertElementCalls.push(args); + }; + + const editor = { + media: { + addMediaAsset: addMediaAssetMock, + }, + project: { + getActive: () => ({ + metadata: { id: "project-1" }, + }), + }, + timeline: { + getTracks: () => tracks, + addTrack: addTrackMock, + insertElement: insertElementMock, + }, + } as unknown as EditorCore; + + const result = await generateAndInsertSpeech({ + editor, + text: "hello world", + startTime: 3, + voice: "default", + }); + + expect(result).toEqual({ duration: 2.5 }); + expect(addMediaAssetCalls).toHaveLength(1); + expect(addMediaAssetCalls[0]).toMatchObject({ + projectId: "project-1", + asset: { + name: "TTS: hello world", + type: "audio", + url: "blob:tts-preview", + duration: 2.5, + ephemeral: true, + }, + }); + expect(insertElementCalls).toHaveLength(1); + expect(insertElementCalls[0]).toMatchObject({ + placement: { + mode: 
"explicit", + trackId: "audio-track-1", + }, + element: { + type: "audio", + sourceType: "upload", + mediaId: "media-1", + name: "TTS: hello world", + duration: 2.5, + startTime: 3, + buffer: fakeBuffer, + }, + }); + expect(addTrackCallCount).toBe(0); + }); + + test("generateAndInsertSpeech creates a new audio track when existing ones overlap", async () => { + globalThis.fetch = (async () => + Response.json({ audio: "AQID" })) as unknown as typeof fetch; + + const tracks: AudioTrack[] = [ + { + id: "audio-track-1", + name: "Audio 1", + type: "audio", + muted: false, + elements: [ + { + id: "audio-el-1", + type: "audio", + sourceType: "upload", + mediaId: "existing-media", + name: "Existing audio", + duration: 10, + startTime: 0, + trimStart: 0, + trimEnd: 0, + volume: 1, + muted: false, + }, + ], + }, + ]; + const addMediaAssetMock = async () => "media-2"; + const addTrackCalls: unknown[] = []; + const addTrackMock = (args: unknown) => { + addTrackCalls.push(args); + return "audio-track-2"; + }; + const insertElementCalls: unknown[] = []; + const insertElementMock = (args: unknown) => { + insertElementCalls.push(args); + }; + + const editor = { + media: { + addMediaAsset: addMediaAssetMock, + }, + project: { + getActive: () => ({ + metadata: { id: "project-1" }, + }), + }, + timeline: { + getTracks: () => tracks, + addTrack: addTrackMock, + insertElement: insertElementMock, + }, + } as unknown as EditorCore; + + await generateAndInsertSpeech({ + editor, + text: "overlap check", + startTime: 2, + }); + + expect(addTrackCalls).toEqual([{ type: "audio" }]); + expect(insertElementCalls[0]).toMatchObject({ + placement: { + mode: "explicit", + trackId: "audio-track-2", + }, + }); + }); +}); From 13588747e4100cc625966c0e15fa2d803696721e Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 00:21:05 +0800 Subject: [PATCH 14/22] test(tts): reduce flake and tighten final invariants Summary: - extract shared legacy metadata fixtures and raise timeout test values to 
reduce scheduler-sensitive flakes - replace the impossible synthetic external fallback response with an explicit invariant error before final upstream error mapping Rationale: - the latest review only pointed out maintainability and test stability issues, but these are cheap to fix and remove avoidable noise - failing loudly on an impossible null upstream response makes future adapter regressions easier to diagnose than masking them as 500s Tests: - bun test apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.test.ts - bunx @biomejs/biome check apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.ts - bunx tsc -p apps/web/tsconfig.json --noEmit Co-authored-by: Codex --- apps/web/src/lib/tts/legacy.test.ts | 54 +++++++++-------------- apps/web/src/lib/tts/openai-compatible.ts | 12 +++-- 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index 67620db..950a746 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -2,6 +2,16 @@ import { describe, expect, test } from "bun:test"; import { synthesizeSpeechWithLegacyProvider } from "./legacy"; describe("synthesizeSpeechWithLegacyProvider", () => { + const TEST_TIMEOUT_MS = 50; + const LEGACY_AUDIO_URL = "https://api.milorapart.top/voice/test.mp3"; + + function legacyMetadataOk(url = LEGACY_AUDIO_URL): Response { + return Response.json({ + code: 200, + url, + }); + } + test("rejects audio urls outside the expected https host allowlist", async () => { const calls: string[] = []; @@ -10,10 +20,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { text: "hello", fetchImpl: async (input) => { calls.push(String(input)); - return Response.json({ - code: 200, - url: "http://127.0.0.1/internal.mp3", - }); + return legacyMetadataOk("http://127.0.0.1/internal.mp3"); }, }), ).rejects.toThrow("Legacy TTS returned an unexpected audio URL"); @@ -27,10 +34,7 @@ 
describe("synthesizeSpeechWithLegacyProvider", () => { text: "hello", fetchImpl: async (input) => { if (String(input).includes("/apis/mbAIsc?")) { - return Response.json({ - code: 200, - url: "https://api.milorapart.top/voice/test.mp3", - }); + return legacyMetadataOk(); } return new Response("", { @@ -48,10 +52,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { text: "hello", fetchImpl: async (input) => { if (String(input).includes("/apis/mbAIsc?")) { - return Response.json({ - code: 200, - url: "https://api.milorapart.top/voice/test.mp3", - }); + return legacyMetadataOk(); } return new Response(Uint8Array.from([1, 2, 3]), { @@ -67,10 +68,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { text: "hello", fetchImpl: async (input) => { if (String(input).includes("/apis/mbAIsc?")) { - return Response.json({ - code: 200, - url: "https://api.milorapart.top/voice/test.mp3", - }); + return legacyMetadataOk(); } return new Response(Uint8Array.from([1, 2, 3]), { @@ -91,10 +89,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { text: "hello", fetchImpl: async (input, init) => { if (String(input).includes("/apis/mbAIsc?")) { - return Response.json({ - code: 200, - url: "https://api.milorapart.top/voice/test.mp3", - }); + return legacyMetadataOk(); } sawManualRedirect = init?.redirect === "manual"; @@ -122,10 +117,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { text: "hello", fetchImpl: async (input, init) => { if (String(input).includes("/apis/mbAIsc?")) { - return Response.json({ - code: 200, - url: "https://api.milorapart.top/voice/test.mp3", - }); + return legacyMetadataOk(); } downloadCallCount++; @@ -164,10 +156,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { text: "中".repeat(400), fetchImpl: async () => { fetchCalled = true; - return Response.json({ - code: 200, - url: "https://api.milorapart.top/voice/test.mp3", - }); + return legacyMetadataOk(); }, }), ).rejects.toThrow("Legacy TTS text is too long for GET fallback"); 
@@ -179,7 +168,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { await expect( synthesizeSpeechWithLegacyProvider({ text: "hello", - timeoutMs: 10, + timeoutMs: TEST_TIMEOUT_MS, fetchImpl: async (_input, init) => new Promise((_resolve, reject) => { init?.signal?.addEventListener( @@ -201,15 +190,12 @@ describe("synthesizeSpeechWithLegacyProvider", () => { await expect( synthesizeSpeechWithLegacyProvider({ text: "hello", - timeoutMs: 10, + timeoutMs: TEST_TIMEOUT_MS, fetchImpl: async (_input, init) => { callCount++; if (callCount === 1) { - return Response.json({ - code: 200, - url: "https://api.milorapart.top/voice/test.mp3", - }); + return legacyMetadataOk(); } return new Promise((_resolve, reject) => { diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 6f55c1d..0f272bc 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -264,12 +264,18 @@ export async function synthesizeSpeechWithOpenAiCompatible({ } } + if (!lastErrorResponse) { + throw new Error( + "Expected external TTS to capture an upstream response before failing", + ); + } + throw new TtsError({ code: "EXTERNAL_TTS_UPSTREAM", message: `External TTS request failed: ${await getUpstreamErrorMessage({ - response: lastErrorResponse ?? 
new Response(null, { status: 500 }), + response: lastErrorResponse, })}`, - retryable: isRetryableStatus(lastErrorResponse?.status), - status: lastErrorResponse?.status, + retryable: isRetryableStatus(lastErrorResponse.status), + status: lastErrorResponse.status, }); } From dcc5a176c9e39d30664e80f6c7e221a85ca19e2a Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 00:43:58 +0800 Subject: [PATCH 15/22] docs(tts): document external provider env vars Summary: - add external TTS env examples to apps/web/.env.example - document preferred EXTERNAL_TTS_API_* settings in the README Rationale: - keep repo-local setup docs aligned with the shipped TTS adapter - reduce configuration drift while origin push and live provider remain blocked Tests: - rg -n "EXTERNAL_TTS_API_BASE_URL|EXTERNAL_TTS_API_MODEL|EXTERNAL_TTS_API_KEY|compatibility aliases|Optional TTS env values" apps/web/.env.example README.md - git diff --check Co-authored-by: Codex --- README.md | 12 ++++++++++++ apps/web/.env.example | 12 +++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e26ac14..eac68fa 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,18 @@ UPSTASH_REDIS_REST_TOKEN="cutia_redis_token" NODE_ENV="development" ``` +Optional TTS env values: + +```bash +EXTERNAL_TTS_API_BASE_URL="https://your-tts-provider.example.com/v1" +EXTERNAL_TTS_API_MODEL="your_tts_model" +EXTERNAL_TTS_API_KEY="your_tts_api_key" +``` + +Cutia prefers `EXTERNAL_TTS_API_*` for external speech synthesis. The legacy +`API_BASE_URL` / `API_MODEL` / `API_KEY` names are still accepted as +compatibility aliases when the namespaced variables are absent. 
+ To enable authentication, also start PostgreSQL and add these env values: ```bash diff --git a/apps/web/.env.example b/apps/web/.env.example index 85e483b..0b78b6c 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -16,9 +16,19 @@ UPSTASH_REDIS_REST_TOKEN=example_token_here FREESOUND_CLIENT_ID=your_client_id_here FREESOUND_API_KEY=your_api_key_here +# Optional: external OpenAI-compatible TTS provider +# Preferred namespaced variables: +EXTERNAL_TTS_API_BASE_URL=https://your-tts-provider.example.com/v1 +EXTERNAL_TTS_API_MODEL=your_tts_model +EXTERNAL_TTS_API_KEY=your_tts_api_key +# Compatibility aliases used when EXTERNAL_TTS_* is absent: +# API_BASE_URL=https://your-shared-api.example.com/v1 +# API_MODEL=your_tts_model +# API_KEY=your_tts_api_key + # Cloudflare R2 (for reference image uploads) R2_ACCOUNT_ID=your_r2_account_id R2_ACCESS_KEY_ID=your_r2_access_key_id R2_SECRET_ACCESS_KEY=your_r2_secret_access_key R2_BUCKET_NAME=your_r2_bucket_name -R2_PUBLIC_URL=https://your-r2-public-url.example.com \ No newline at end of file +R2_PUBLIC_URL=https://your-r2-public-url.example.com From 1b99bcc9a7a1a3fb7db8aef13badf927b800c0e7 Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 00:49:04 +0800 Subject: [PATCH 16/22] docs(tts): add live probe commands Summary: - document an adapter-level external TTS probe in the README - document an end-to-end route probe in the README Rationale: - make future provider validation runnable without digging through Linear - reduce manual handoff friction while external blockers remain Tests: - rg -n "Cutia TTS probe|Cutia route probe|verify that the configured provider can actually return audio|verify the route end-to-end" README.md - git diff --check Co-authored-by: Codex --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index eac68fa..7d43b9b 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,18 @@ Cutia prefers `EXTERNAL_TTS_API_*` for external speech 
synthesis. The legacy `API_BASE_URL` / `API_MODEL` / `API_KEY` names are still accepted as compatibility aliases when the namespaced variables are absent. +To verify that the configured provider can actually return audio, run this from `apps/web`: + +```bash +bun --eval 'import { getExternalTtsConfig, synthesizeSpeechWithOpenAiCompatible } from "./src/lib/tts/openai-compatible.ts"; const config = getExternalTtsConfig({ env: process.env }); const audio = await synthesizeSpeechWithOpenAiCompatible({ config, text: "Cutia TTS probe", voice: "default" }); console.log(audio.byteLength);' +``` + +If you want to verify the route end-to-end from the app directory (`apps/web`), run: + +```bash +NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:3000 UPSTASH_REDIS_REST_URL=http://localhost:8079 UPSTASH_REDIS_REST_TOKEN=cutia_redis_token bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; const request = new NextRequest("http://localhost/api/tts/generate", { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ text: "Cutia route probe", voice: "default" }) }); const response = await POST(request); console.log(response.status); console.log(await response.text());' +``` + To enable authentication, also start PostgreSQL and add these env values: ```bash From db757802866f3e29e4bc2e9882675e96d22b554f Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 01:17:39 +0800 Subject: [PATCH 17/22] feat(tts): add responses websocket fallback Summary: - add a /responses WebSocket fallback to the external TTS adapter when /audio/speech is unavailable - assemble audio chunks from response audio delta events and map websocket close reasons into structured upstream errors - extend TTS adapter and provider tests to cover the new fallback path and updated error behavior Rationale: - the active shared gateway exposes TTS generation through /responses over WebSocket rather than only /audio/speech - supporting both
compatibility shapes narrows the remaining live blocker to provider account availability instead of protocol gaps Tests: - bun test ./src/lib/tts/service.test.ts ./src/lib/tts/fetch-with-timeout.test.ts ./src/lib/tts/provider.test.ts ./src/lib/tts/openai-compatible.test.ts ./src/lib/tts/legacy.test.ts ./src/app/api/tts/generate/route.test.ts - bun --eval 'import { getExternalTtsConfig, synthesizeSpeechWithOpenAiCompatible } from "./src/lib/tts/openai-compatible.ts"; const config = getExternalTtsConfig({ env: process.env }); const audio = await synthesizeSpeechWithOpenAiCompatible({ config, text: "Cutia TTS probe", voice: "default" }); console.log(audio.byteLength);' (fails: External TTS websocket request failed: no available account) - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:3000 UPSTASH_REDIS_REST_URL=http://localhost:8079 UPSTASH_REDIS_REST_TOKEN=cutia_redis_token bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; const request = new NextRequest("http://localhost/api/tts/generate", { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ text: "Cutia route probe", voice: "default" }) }); const response = await POST(request); console.log(response.status); console.log(await response.text());' (returns 502 with no available account) Co-authored-by: Codex --- .../web/src/lib/tts/openai-compatible.test.ts | 172 +++++++- apps/web/src/lib/tts/openai-compatible.ts | 404 +++++++++++++++++- apps/web/src/lib/tts/provider.test.ts | 21 +- 3 files changed, 577 insertions(+), 20 deletions(-) diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index eaad7e8..383e6fd 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -5,6 +5,52 @@ import { synthesizeSpeechWithOpenAiCompatible, } from "./openai-compatible"; +type WebSocketListenerMap = { + close: 
Array<(event: { code: number; reason: string }) => void>; + error: Array<(event: { message?: string; type?: string }) => void>; + message: Array<(event: { data: unknown }) => void>; + open: Array<() => void>; +}; + +class FakeWebSocket { + public readonly sentMessages: string[] = []; + private readonly listeners: WebSocketListenerMap = { + close: [], + error: [], + message: [], + open: [], + }; + + constructor( + public readonly url: string, + public readonly init?: { headers?: Record }, + ) {} + + addEventListener( + type: K, + listener: WebSocketListenerMap[K][number], + ) { + this.listeners[type].push(listener); + } + + close(code = 1000, reason = "") { + this.emit("close", { code, reason }); + } + + emit( + type: K, + event: Parameters[0], + ) { + for (const listener of this.listeners[type]) { + listener(event as never); + } + } + + send(message: string) { + this.sentMessages.push(message); + } +} + describe("getExternalTtsConfig", () => { test("reads namespaced TTS config from environment", () => { const config = getExternalTtsConfig({ @@ -236,9 +282,9 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { text: "hello", voice: "default", fetchImpl: async () => - new Response("", { + new Response("not audio", { status: 200, - headers: { "Content-Type": "text/html; charset=utf-8" }, + headers: { "Content-Type": "text/plain; charset=utf-8" }, }), }), ).rejects.toThrow("Expected audio response"); @@ -398,4 +444,126 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { }), ).rejects.toThrow('{"message":"bad request"}'); }); + + test("falls back to /responses websocket audio when /audio/speech returns 404", async () => { + const sockets: FakeWebSocket[] = []; + const synthesis = synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + createWebSocket: (url, init) => { + const socket = new FakeWebSocket(url, init); + sockets.push(socket); + 
return socket; + }, + fetchImpl: async () => new Response("page not found", { status: 404 }), + }); + await new Promise((resolve) => setTimeout(resolve, 0)); + + expect(sockets).toHaveLength(1); + expect(sockets[0]?.url).toBe("wss://example.com/v1/responses"); + expect(sockets[0]?.init?.headers?.Authorization).toBe("Bearer secret"); + + sockets[0]?.emit("open", undefined); + expect(JSON.parse(sockets[0]?.sentMessages[0] ?? "")).toEqual({ + audio: { format: "mp3" }, + input: "hello", + model: "tts-1", + output_modalities: ["audio"], + response: { + instructions: "hello", + modalities: ["audio"], + output_audio_format: "mp3", + voice: DEFAULT_EXTERNAL_TTS_VOICE, + }, + type: "response.create", + }); + sockets[0]?.emit("message", { + data: JSON.stringify({ + type: "response.audio.delta", + delta: Buffer.from(Uint8Array.from([7, 8, 9])).toString("base64"), + }), + }); + sockets[0]?.emit("message", { + data: JSON.stringify({ type: "response.completed" }), + }); + + expect(Array.from(new Uint8Array(await synthesis))).toEqual([7, 8, 9]); + }); + + test("falls back to /responses websocket audio when /audio/speech returns html", async () => { + const sockets: FakeWebSocket[] = []; + const synthesis = synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "echo", + createWebSocket: (url, init) => { + const socket = new FakeWebSocket(url, init); + sockets.push(socket); + return socket; + }, + fetchImpl: async () => + new Response("", { + status: 200, + headers: { "Content-Type": "text/html; charset=utf-8" }, + }), + }); + await new Promise((resolve) => setTimeout(resolve, 0)); + + sockets[0]?.emit("open", undefined); + sockets[0]?.emit("message", { + data: JSON.stringify({ + type: "response.output_audio.delta", + delta: Buffer.from(Uint8Array.from([1, 2, 3, 4])).toString("base64"), + }), + }); + sockets[0]?.emit("message", { + data: JSON.stringify({ type: "response.done" 
}), + }); + + expect(Array.from(new Uint8Array(await synthesis))).toEqual([1, 2, 3, 4]); + expect(JSON.parse(sockets[0]?.sentMessages[0] ?? "").response.voice).toBe( + "echo", + ); + }); + + test("surfaces websocket close reasons as structured upstream errors", async () => { + const sockets: FakeWebSocket[] = []; + const synthesis = synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + createWebSocket: (url, init) => { + const socket = new FakeWebSocket(url, init); + sockets.push(socket); + return socket; + }, + fetchImpl: async () => new Response("page not found", { status: 404 }), + }); + await new Promise((resolve) => setTimeout(resolve, 0)); + + sockets[0]?.emit("open", undefined); + sockets[0]?.emit("close", { + code: 1013, + reason: "no available account", + }); + + await expect(synthesis).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS websocket request failed: no available account", + retryable: false, + }); + }); }); diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 0f272bc..0cbf392 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -12,6 +12,7 @@ const externalTtsConfigSchema = z.object({ API_KEY: z.string().min(1), }); const EXTERNAL_TTS_TIMEOUT_MS = 15_000; +const EXTERNAL_TTS_RESPONSES_AUDIO_FORMAT = "mp3"; export { DEFAULT_EXTERNAL_TTS_VOICE }; @@ -21,6 +22,58 @@ export interface ExternalTtsConfig { model: string; } +interface ExternalTtsWebSocketMessageEvent { + data: unknown; +} + +interface ExternalTtsWebSocketErrorEvent { + message?: string; + type?: string; +} + +interface ExternalTtsWebSocketCloseEvent { + code: number; + reason: string; +} + +export interface ExternalTtsWebSocketLike { + addEventListener( + type: "close", + listener: (event: ExternalTtsWebSocketCloseEvent) => void, + ): 
void; + addEventListener( + type: "error", + listener: (event: ExternalTtsWebSocketErrorEvent) => void, + ): void; + addEventListener( + type: "message", + listener: (event: ExternalTtsWebSocketMessageEvent) => void, + ): void; + addEventListener(type: "open", listener: () => void): void; + close(code?: number, reason?: string): void; + removeEventListener?( + type: "close", + listener: (event: ExternalTtsWebSocketCloseEvent) => void, + ): void; + removeEventListener?( + type: "error", + listener: (event: ExternalTtsWebSocketErrorEvent) => void, + ): void; + removeEventListener?( + type: "message", + listener: (event: ExternalTtsWebSocketMessageEvent) => void, + ): void; + removeEventListener?(type: "open", listener: () => void): void; + send(data: string): void; +} + +export type ExternalTtsWebSocketFactory = ( + url: string, + init?: { + headers?: Record; + }, +) => ExternalTtsWebSocketLike; + function resolveExternalTtsEnv({ env, }: { @@ -169,14 +222,343 @@ function getSpeechEndpointUrls({ return [...new Set(urls)]; } +function getResponsesEndpointUrls({ + apiBaseUrl, +}: { + apiBaseUrl: string; +}): string[] { + const normalizedBaseUrl = apiBaseUrl.replace(/\/+$/, ""); + const baseWithoutV1 = normalizedBaseUrl.endsWith("/v1") + ? normalizedBaseUrl.slice(0, -3) + : normalizedBaseUrl; + const baseWithV1 = normalizedBaseUrl.endsWith("/v1") + ? normalizedBaseUrl + : `${normalizedBaseUrl}/v1`; + const urls = [`${baseWithV1}/responses`, `${baseWithoutV1}/responses`]; + + return [...new Set(urls)]; +} + +function toWebSocketUrl({ url }: { url: string }): string { + const parsed = new URL(url); + parsed.protocol = parsed.protocol === "https:" ? "wss:" : "ws:"; + return parsed.toString(); +} + +function isAudioContentType({ contentType }: { contentType: string }): boolean { + const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? 
""; + + return mimeType.startsWith("audio/") || mimeType === "application/octet-stream"; +} + +function shouldTryResponsesWebSocket({ + response, +}: { + response: Response; +}): boolean { + if (response.status === 404 || response.status === 405 || response.status === 426) { + return true; + } + + if (!response.ok) { + return false; + } + + const contentType = response.headers.get("content-type") ?? ""; + const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? ""; + + return mimeType === "text/html"; +} + +function getResponsesWebSocketCloseRetryable({ + code, + reason, +}: { + code: number; + reason: string; +}): boolean { + const normalizedReason = reason.trim().toLowerCase(); + + if ( + normalizedReason.includes("no available account") || + normalizedReason.includes("required") || + normalizedReason.includes("unsupported") + ) { + return false; + } + + return code === 1006 || code === 1011 || code === 1012 || code === 1013; +} + +function getResponsesWebSocketError({ + code, + reason, +}: { + code: number; + reason: string; +}): TtsError { + return new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `External TTS websocket request failed: ${ + reason || `WebSocket closed (${code})` + }`, + retryable: getResponsesWebSocketCloseRetryable({ code, reason }), + }); +} + +function getResponseEventErrorMessage({ + event, +}: { + event: Record; +}): string | null { + if (typeof event.message === "string" && event.message.trim()) { + return event.message; + } + + if ( + typeof event.error === "object" && + event.error !== null && + "message" in event.error && + typeof event.error.message === "string" && + event.error.message.trim() + ) { + return event.error.message; + } + + return null; +} + +function createExternalTtsWebSocket( + url: string, + init?: { headers?: Record }, +): ExternalTtsWebSocketLike { + type NodeCompatibleWebSocket = new ( + url: string, + init?: { headers?: Record }, + ) => ExternalTtsWebSocketLike; + + const WebSocketCtor = + 
globalThis.WebSocket as unknown as NodeCompatibleWebSocket; + + return new WebSocketCtor(url, init); +} + +async function synthesizeSpeechWithResponsesWebSocket({ + config, + createWebSocket = createExternalTtsWebSocket, + text, + voice, +}: { + config: ExternalTtsConfig; + createWebSocket?: ExternalTtsWebSocketFactory; + text: string; + voice?: string; +}): Promise { + const endpointUrl = toWebSocketUrl({ + url: + getResponsesEndpointUrls({ apiBaseUrl: config.apiBaseUrl })[0] ?? + `${config.apiBaseUrl.replace(/\/+$/, "")}/responses`, + }); + const audioChunks: Uint8Array[] = []; + + return await new Promise((resolve, reject) => { + const socket = createWebSocket(endpointUrl, { + headers: { + Authorization: `Bearer ${config.apiKey}`, + }, + }); + let settled = false; + + const cleanup = () => { + socket.removeEventListener?.("close", handleClose); + socket.removeEventListener?.("error", handleError); + socket.removeEventListener?.("message", handleMessage); + socket.removeEventListener?.("open", handleOpen); + }; + + const finish = ({ + error, + value, + }: { + error?: TtsError; + value?: ArrayBuffer; + }) => { + if (settled) { + return; + } + + settled = true; + cleanup(); + + try { + socket.close(); + } catch { + // Best effort cleanup only. + } + + if (error) { + reject(error); + return; + } + + resolve(value ?? 
new ArrayBuffer(0)); + }; + + const handleOpen = () => { + try { + socket.send( + JSON.stringify({ + audio: { + format: EXTERNAL_TTS_RESPONSES_AUDIO_FORMAT, + }, + input: text, + model: config.model, + output_modalities: ["audio"], + response: { + instructions: text, + modalities: ["audio"], + output_audio_format: EXTERNAL_TTS_RESPONSES_AUDIO_FORMAT, + voice: resolveVoice({ voice }), + }, + type: "response.create", + }), + ); + } catch (error) { + finish({ + error: wrapExternalUpstreamError({ error }), + }); + } + }; + + const handleMessage = async ({ + data, + }: ExternalTtsWebSocketMessageEvent) => { + try { + if (data instanceof Blob) { + audioChunks.push(new Uint8Array(await data.arrayBuffer())); + return; + } + + if (data instanceof ArrayBuffer) { + audioChunks.push(new Uint8Array(data)); + return; + } + + if (ArrayBuffer.isView(data)) { + audioChunks.push( + new Uint8Array( + data.buffer.slice( + data.byteOffset, + data.byteOffset + data.byteLength, + ), + ), + ); + return; + } + + if (typeof data !== "string") { + return; + } + + const event = JSON.parse(data) as Record; + const type = typeof event.type === "string" ? 
event.type : ""; + + if ( + type === "response.audio.delta" || + type === "response.output_audio.delta" + ) { + if (typeof event.delta === "string" && event.delta.length > 0) { + audioChunks.push(Uint8Array.from(Buffer.from(event.delta, "base64"))); + } + return; + } + + if (type === "response.completed" || type === "response.done") { + const audio = Buffer.concat( + audioChunks.map((chunk) => Buffer.from(chunk)), + ); + + if (audio.byteLength === 0) { + finish({ + error: new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS returned empty audio", + retryable: false, + }), + }); + return; + } + + finish({ + value: audio.buffer.slice( + audio.byteOffset, + audio.byteOffset + audio.byteLength, + ), + }); + return; + } + + if ( + type === "error" || + type === "response.error" || + type === "response.failed" || + type === "response.incomplete" + ) { + finish({ + error: new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + getResponseEventErrorMessage({ event }) ?? 
+ "External TTS websocket request failed", + retryable: false, + }), + }); + } + } catch (error) { + finish({ + error: wrapExternalUpstreamError({ error }), + }); + } + }; + + const handleError = (event: ExternalTtsWebSocketErrorEvent) => { + finish({ + error: new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + event.message?.trim() || "External TTS websocket request failed", + retryable: true, + }), + }); + }; + + const handleClose = ({ code, reason }: ExternalTtsWebSocketCloseEvent) => { + if (settled) { + return; + } + + finish({ + error: getResponsesWebSocketError({ code, reason }), + }); + }; + + socket.addEventListener("open", handleOpen); + socket.addEventListener("message", handleMessage); + socket.addEventListener("error", handleError); + socket.addEventListener("close", handleClose); + }); +} + export async function synthesizeSpeechWithOpenAiCompatible({ config, + createWebSocket = createExternalTtsWebSocket, text, voice, fetchImpl = fetch, timeoutMs = EXTERNAL_TTS_TIMEOUT_MS, }: { config: ExternalTtsConfig; + createWebSocket?: ExternalTtsWebSocketFactory; text: string; voice?: string; fetchImpl?: FetchLike; @@ -218,12 +600,12 @@ export async function synthesizeSpeechWithOpenAiCompatible({ if (response.ok) { const contentType = response.headers.get("content-type") ?? ""; - const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? 
""; + if (!isAudioContentType({ contentType })) { + if (shouldTryResponsesWebSocket({ response })) { + lastErrorResponse = response; + break; + } - if ( - !mimeType.startsWith("audio/") && - mimeType !== "application/octet-stream" - ) { throw new TtsError({ code: "EXTERNAL_TTS_UPSTREAM", message: `Expected audio response, received ${contentType || "(no content-type)"}`, @@ -264,6 +646,18 @@ export async function synthesizeSpeechWithOpenAiCompatible({ } } + if ( + lastErrorResponse && + shouldTryResponsesWebSocket({ response: lastErrorResponse }) + ) { + return synthesizeSpeechWithResponsesWebSocket({ + config, + createWebSocket, + text, + voice, + }); + } + if (!lastErrorResponse) { throw new Error( "Expected external TTS to capture an upstream response before failing", diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index abef510..2487e35 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -1,6 +1,5 @@ import { describe, expect, test } from "bun:test"; import { TtsError } from "./errors"; -import { synthesizeSpeechWithOpenAiCompatible } from "./openai-compatible"; import { synthesizeSpeechWithFallback } from "./provider"; describe("synthesizeSpeechWithFallback", () => { @@ -126,17 +125,13 @@ describe("synthesizeSpeechWithFallback", () => { }, text: "hello", voice: "default", - openAiSynthesize: ({ config, text, voice }) => - synthesizeSpeechWithOpenAiCompatible({ - config, - text, - voice, - fetchImpl: async () => - new Response("", { - status: 200, - headers: { "Content-Type": "text/html; charset=utf-8" }, - }), - }), + openAiSynthesize: async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS websocket request failed: no available account", + retryable: false, + }); + }, legacySynthesize: async () => { legacyCalled = true; return Uint8Array.from([7, 8, 9]).buffer; @@ -144,7 +139,7 @@ describe("synthesizeSpeechWithFallback", () => { }), 
).rejects.toMatchObject({ code: "EXTERNAL_TTS_UPSTREAM", - message: "Expected audio response, received text/html; charset=utf-8", + message: "External TTS websocket request failed: no available account", retryable: false, }); From 9be976b084b9a11e9327005f7287b6ad99416d03 Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 02:38:36 +0800 Subject: [PATCH 18/22] test(tts): fix websocket test typing Summary: - fix the FakeWebSocket helper typings in openai-compatible tests so tsc accepts eventful listeners without collapsing them to the open handler - update the websocket test call sites to use a no-arg open event helper - apply biome formatting to the touched TTS files Rationale: - the repo-level web typecheck was still failing even though the test suite passed, which left the ticket with an unverified validation gap - keeping the fix scoped to the test helper preserves runtime behavior while restoring the full validation surface for the TTS work Tests: - bun test apps/web/src/lib/tts/openai-compatible.test.ts - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/service.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx tsc -p apps/web/tsconfig.json --noEmit - bunx @biomejs/biome check apps/web/src/app/api/tts/generate/route.ts apps/web/src/app/api/tts/generate/route.test.ts apps/web/src/lib/tts/errors.ts apps/web/src/lib/tts/fetch-with-timeout.ts apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/legacy.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/service.ts apps/web/src/lib/tts/service.test.ts apps/web/src/constants/tts-constants.ts packages/env/src/web.ts README.md apps/web/.env.example 
docs/plans/2026-03-17-tts-external-provider-design.md docs/plans/2026-03-17-tts-external-provider.md Co-authored-by: Codex --- .../web/src/lib/tts/openai-compatible.test.ts | 54 ++++++++++++++----- apps/web/src/lib/tts/openai-compatible.ts | 14 +++-- apps/web/src/lib/tts/provider.test.ts | 3 +- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 383e6fd..2372246 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -26,23 +26,51 @@ class FakeWebSocket { public readonly init?: { headers?: Record }, ) {} - addEventListener( - type: K, - listener: WebSocketListenerMap[K][number], + addEventListener( + type: "close", + listener: WebSocketListenerMap["close"][number], + ): void; + addEventListener( + type: "error", + listener: WebSocketListenerMap["error"][number], + ): void; + addEventListener( + type: "message", + listener: WebSocketListenerMap["message"][number], + ): void; + addEventListener( + type: "open", + listener: WebSocketListenerMap["open"][number], + ): void; + addEventListener( + type: keyof WebSocketListenerMap, + listener: WebSocketListenerMap[keyof WebSocketListenerMap][number], ) { - this.listeners[type].push(listener); + ( + this.listeners[type] as Array< + ( + event?: + | { code: number; reason: string } + | { message?: string; type?: string } + | { data: unknown }, + ) => void + > + ).push(listener as (event?: unknown) => void); } close(code = 1000, reason = "") { this.emit("close", { code, reason }); } - emit( - type: K, - event: Parameters[0], - ) { - for (const listener of this.listeners[type]) { - listener(event as never); + emit(type: "close", event: { code: number; reason: string }): void; + emit(type: "error", event: { message?: string; type?: string }): void; + emit(type: "message", event: { data: unknown }): void; + emit(type: "open"): void; + emit(type: keyof 
WebSocketListenerMap, event?: unknown) { + for (const listener of this.listeners[type] as Array< + (event?: unknown) => void + >) { + listener(event); } } @@ -468,7 +496,7 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { expect(sockets[0]?.url).toBe("wss://example.com/v1/responses"); expect(sockets[0]?.init?.headers?.Authorization).toBe("Bearer secret"); - sockets[0]?.emit("open", undefined); + sockets[0]?.emit("open"); expect(JSON.parse(sockets[0]?.sentMessages[0] ?? "")).toEqual({ audio: { format: "mp3" }, input: "hello", @@ -518,7 +546,7 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { }); await new Promise((resolve) => setTimeout(resolve, 0)); - sockets[0]?.emit("open", undefined); + sockets[0]?.emit("open"); sockets[0]?.emit("message", { data: JSON.stringify({ type: "response.output_audio.delta", @@ -554,7 +582,7 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { }); await new Promise((resolve) => setTimeout(resolve, 0)); - sockets[0]?.emit("open", undefined); + sockets[0]?.emit("open"); sockets[0]?.emit("close", { code: 1013, reason: "no available account", diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 0cbf392..9eb4a3f 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -248,7 +248,9 @@ function toWebSocketUrl({ url }: { url: string }): string { function isAudioContentType({ contentType }: { contentType: string }): boolean { const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? 
""; - return mimeType.startsWith("audio/") || mimeType === "application/octet-stream"; + return ( + mimeType.startsWith("audio/") || mimeType === "application/octet-stream" + ); } function shouldTryResponsesWebSocket({ @@ -256,7 +258,11 @@ function shouldTryResponsesWebSocket({ }: { response: Response; }): boolean { - if (response.status === 404 || response.status === 405 || response.status === 426) { + if ( + response.status === 404 || + response.status === 405 || + response.status === 426 + ) { return true; } @@ -468,7 +474,9 @@ async function synthesizeSpeechWithResponsesWebSocket({ type === "response.output_audio.delta" ) { if (typeof event.delta === "string" && event.delta.length > 0) { - audioChunks.push(Uint8Array.from(Buffer.from(event.delta, "base64"))); + audioChunks.push( + Uint8Array.from(Buffer.from(event.delta, "base64")), + ); } return; } diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index 2487e35..3269ce9 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -128,7 +128,8 @@ describe("synthesizeSpeechWithFallback", () => { openAiSynthesize: async () => { throw new TtsError({ code: "EXTERNAL_TTS_UPSTREAM", - message: "External TTS websocket request failed: no available account", + message: + "External TTS websocket request failed: no available account", retryable: false, }); }, From 2068474992b261e9ba31be7a90bd523b23514cff Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 02:54:04 +0800 Subject: [PATCH 19/22] fix(tts): fall back on account exhaustion Summary: - treat websocket `no available account` closes as retryable external upstream errors - add regression coverage proving provider fallback recovers from account exhaustion while keeping non-retryable websocket errors terminal - update the design note so retryable external failures include websocket capacity exhaustion Rationale: - the shared external gateway currently reports account exhaustion 
even though the legacy provider is healthy, so treating it as terminal left real route requests failing with 502 unnecessarily - classifying account exhaustion as retryable preserves the explicit error semantics inside the adapter while restoring user-facing audio output via the existing legacy fallback path Tests: - bun test apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.test.ts - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/service.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx tsc -p apps/web/tsconfig.json --noEmit - bunx @biomejs/biome check apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.test.ts docs/plans/2026-03-17-tts-external-provider-design.md - cd apps/web && NEXT_PUBLIC_SITE_URL=http://localhost:4100 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy NODE_ENV=production bun run build - NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:4311 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./apps/web/src/app/api/tts/generate/route.ts"; /* status=200 audioLength=34364 */' Co-authored-by: Codex --- .../web/src/lib/tts/openai-compatible.test.ts | 4 +-- apps/web/src/lib/tts/openai-compatible.ts | 1 - apps/web/src/lib/tts/provider.test.ts | 31 ++++++++++++++++++- ...2026-03-17-tts-external-provider-design.md | 2 +- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 2372246..3fc238a 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -563,7 +563,7 @@ 
describe("synthesizeSpeechWithOpenAiCompatible", () => { ); }); - test("surfaces websocket close reasons as structured upstream errors", async () => { + test("marks websocket account exhaustion as retryable so legacy fallback can recover", async () => { const sockets: FakeWebSocket[] = []; const synthesis = synthesizeSpeechWithOpenAiCompatible({ config: { @@ -591,7 +591,7 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { await expect(synthesis).rejects.toMatchObject({ code: "EXTERNAL_TTS_UPSTREAM", message: "External TTS websocket request failed: no available account", - retryable: false, + retryable: true, }); }); }); diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 9eb4a3f..980e722 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -286,7 +286,6 @@ function getResponsesWebSocketCloseRetryable({ const normalizedReason = reason.trim().toLowerCase(); if ( - normalizedReason.includes("no available account") || normalizedReason.includes("required") || normalizedReason.includes("unsupported") ) { diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts index 3269ce9..e214fc5 100644 --- a/apps/web/src/lib/tts/provider.test.ts +++ b/apps/web/src/lib/tts/provider.test.ts @@ -113,7 +113,36 @@ describe("synthesizeSpeechWithFallback", () => { expect(legacyCalled).toBe(false); }); - test("does not fall back when the external provider returns a non-audio success response", async () => { + test("falls back when the external provider reports no available account", async () => { + let legacyCalled = false; + + const result = await synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + "External TTS websocket request 
failed: no available account", + retryable: true, + }); + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([7, 8, 9]); + expect(legacyCalled).toBe(true); + }); + + test("does not fall back when the external provider returns a non-retryable websocket account error", async () => { let legacyCalled = false; await expect( diff --git a/docs/plans/2026-03-17-tts-external-provider-design.md b/docs/plans/2026-03-17-tts-external-provider-design.md index cadce47..c2bf239 100644 --- a/docs/plans/2026-03-17-tts-external-provider-design.md +++ b/docs/plans/2026-03-17-tts-external-provider-design.md @@ -104,7 +104,7 @@ - 请求参数非法:返回 `400` - TTS 环境变量缺失:返回 `500`,信息明确为未配置 -- 外部 provider 返回可重试的 `EXTERNAL_TTS_UPSTREAM`(例如超时、`429`、`5xx`)时,会先回退到 legacy provider +- 外部 provider 返回可重试的 `EXTERNAL_TTS_UPSTREAM`(例如超时、`429`、`5xx`,或 websocket `no available account` 这类账号容量耗尽)时,会先回退到 legacy provider - legacy 回退成功:最终仍可能返回 `200` - legacy 回退失败:最终返回 `502` - 外部 provider 返回不可重试的 `EXTERNAL_TTS_UPSTREAM`(例如 `401`/`403`/`404`、空音频、非音频响应)时:直接返回 `502` From 5750a17e6fd46b7b400760c2881f91a53230e809 Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 03:31:33 +0800 Subject: [PATCH 20/22] fix(tts): address remaining rework review comments Summary: - cancel unconsumed external TTS responses before retry and websocket fallback branches - add regression tests that assert response cleanup on 404, html fallback, and non-audio success responses - keep example external TTS env vars disabled by default and extract a shared metadata-route helper in legacy tests Rationale: - releasing abandoned response bodies avoids tying up fetch connections while probing multiple upstream endpoints - disabling placeholder external env vars prevents fresh setups from accidentally enabling the external path against a fake host - absorbing the remaining test helper nit removes repeated brittle string matching 
from the legacy suite Tests: - bun test apps/web/src/lib/tts/fetch-with-timeout.test.ts apps/web/src/lib/tts/legacy.test.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/provider.test.ts apps/web/src/lib/tts/service.test.ts apps/web/src/app/api/tts/generate/route.test.ts - bunx tsc -p apps/web/tsconfig.json --noEmit - bunx @biomejs/biome check apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts apps/web/src/lib/tts/legacy.test.ts - cd apps/web && NEXT_PUBLIC_SITE_URL=http://localhost:4100 UPSTASH_REDIS_REST_URL=https://example.com UPSTASH_REDIS_REST_TOKEN=dummy NODE_ENV=production bun run build - git diff --check Co-authored-by: Codex --- apps/web/.env.example | 6 +-- apps/web/src/lib/tts/legacy.test.ts | 15 ++++-- .../web/src/lib/tts/openai-compatible.test.ts | 47 ++++++++++++++++--- apps/web/src/lib/tts/openai-compatible.ts | 16 +++++++ 4 files changed, 69 insertions(+), 15 deletions(-) diff --git a/apps/web/.env.example b/apps/web/.env.example index 0b78b6c..cd7f63d 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -18,9 +18,9 @@ FREESOUND_API_KEY=your_api_key_here # Optional: external OpenAI-compatible TTS provider # Preferred namespaced variables: -EXTERNAL_TTS_API_BASE_URL=https://your-tts-provider.example.com/v1 -EXTERNAL_TTS_API_MODEL=your_tts_model -EXTERNAL_TTS_API_KEY=your_tts_api_key +# EXTERNAL_TTS_API_BASE_URL=https://your-tts-provider.example.com/v1 +# EXTERNAL_TTS_API_MODEL=your_tts_model +# EXTERNAL_TTS_API_KEY=your_tts_api_key # Compatibility aliases used when EXTERNAL_TTS_* is absent: # API_BASE_URL=https://your-shared-api.example.com/v1 # API_MODEL=your_tts_model diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts index 950a746..c3e14ac 100644 --- a/apps/web/src/lib/tts/legacy.test.ts +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -4,6 +4,7 @@ import { synthesizeSpeechWithLegacyProvider } from "./legacy"; 
describe("synthesizeSpeechWithLegacyProvider", () => { const TEST_TIMEOUT_MS = 50; const LEGACY_AUDIO_URL = "https://api.milorapart.top/voice/test.mp3"; + const LEGACY_METADATA_ROUTE = "/apis/mbAIsc?"; function legacyMetadataOk(url = LEGACY_AUDIO_URL): Response { return Response.json({ @@ -12,6 +13,10 @@ describe("synthesizeSpeechWithLegacyProvider", () => { }); } + function isLegacyMetadataRequest(input: RequestInfo | URL): boolean { + return String(input).includes(LEGACY_METADATA_ROUTE); + } + test("rejects audio urls outside the expected https host allowlist", async () => { const calls: string[] = []; @@ -33,7 +38,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { synthesizeSpeechWithLegacyProvider({ text: "hello", fetchImpl: async (input) => { - if (String(input).includes("/apis/mbAIsc?")) { + if (isLegacyMetadataRequest(input)) { return legacyMetadataOk(); } @@ -51,7 +56,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { synthesizeSpeechWithLegacyProvider({ text: "hello", fetchImpl: async (input) => { - if (String(input).includes("/apis/mbAIsc?")) { + if (isLegacyMetadataRequest(input)) { return legacyMetadataOk(); } @@ -67,7 +72,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { const audio = await synthesizeSpeechWithLegacyProvider({ text: "hello", fetchImpl: async (input) => { - if (String(input).includes("/apis/mbAIsc?")) { + if (isLegacyMetadataRequest(input)) { return legacyMetadataOk(); } @@ -88,7 +93,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { synthesizeSpeechWithLegacyProvider({ text: "hello", fetchImpl: async (input, init) => { - if (String(input).includes("/apis/mbAIsc?")) { + if (isLegacyMetadataRequest(input)) { return legacyMetadataOk(); } @@ -116,7 +121,7 @@ describe("synthesizeSpeechWithLegacyProvider", () => { const audio = await synthesizeSpeechWithLegacyProvider({ text: "hello", fetchImpl: async (input, init) => { - if (String(input).includes("/apis/mbAIsc?")) { + if (isLegacyMetadataRequest(input)) 
{ return legacyMetadataOk(); } diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts index 3fc238a..c7608fd 100644 --- a/apps/web/src/lib/tts/openai-compatible.test.ts +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -238,6 +238,7 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { test("falls back to the root audio speech path when the v1 path returns 404", async () => { const calls: string[] = []; + const cancelledResponses: string[] = []; const audio = await synthesizeSpeechWithOpenAiCompatible({ config: { @@ -252,7 +253,16 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { calls.push(url); if (url === "https://example.com/v1/audio/speech") { - return new Response("page not found", { status: 404 }); + return { + body: { + cancel: async () => { + cancelledResponses.push(url); + }, + }, + headers: new Headers(), + ok: false, + status: 404, + } as Response; } return new Response(Uint8Array.from([9, 8, 7]), { @@ -267,6 +277,7 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { "https://example.com/v1/audio/speech", "https://example.com/audio/speech", ]); + expect(cancelledResponses).toEqual(["https://example.com/v1/audio/speech"]); }); test("tries the /v1 speech endpoint first when the base url is root-level", async () => { @@ -300,6 +311,8 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { }); test("rejects non-audio success responses", async () => { + let cancelCalled = false; + await expect( synthesizeSpeechWithOpenAiCompatible({ config: { @@ -310,12 +323,22 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { text: "hello", voice: "default", fetchImpl: async () => - new Response("not audio", { + ({ + body: { + cancel: async () => { + cancelCalled = true; + }, + }, + headers: new Headers({ + "Content-Type": "text/plain; charset=utf-8", + }), + ok: true, status: 200, - headers: { "Content-Type": "text/plain; charset=utf-8" }, - }), + }) as Response, }), 
).rejects.toThrow("Expected audio response"); + + expect(cancelCalled).toBe(true); }); test("rejects success responses when the content-type header is missing", async () => { @@ -525,6 +548,7 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { test("falls back to /responses websocket audio when /audio/speech returns html", async () => { const sockets: FakeWebSocket[] = []; + let cancelCalled = false; const synthesis = synthesizeSpeechWithOpenAiCompatible({ config: { apiBaseUrl: "https://example.com/v1", @@ -539,13 +563,22 @@ describe("synthesizeSpeechWithOpenAiCompatible", () => { return socket; }, fetchImpl: async () => - new Response("", { + ({ + body: { + cancel: async () => { + cancelCalled = true; + }, + }, + headers: new Headers({ + "Content-Type": "text/html; charset=utf-8", + }), + ok: true, status: 200, - headers: { "Content-Type": "text/html; charset=utf-8" }, - }), + }) as Response, }); await new Promise((resolve) => setTimeout(resolve, 0)); + expect(cancelCalled).toBe(true); sockets[0]?.emit("open"); sockets[0]?.emit("message", { data: JSON.stringify({ diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts index 980e722..270c03f 100644 --- a/apps/web/src/lib/tts/openai-compatible.ts +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -276,6 +276,18 @@ function shouldTryResponsesWebSocket({ return mimeType === "text/html"; } +async function cancelResponseBody({ + response, +}: { + response: Response; +}): Promise<void> { + try { + await response.body?.cancel(); + } catch { + // Best-effort cleanup only. 
+ } +} + function getResponsesWebSocketCloseRetryable({ code, reason, @@ -610,9 +622,11 @@ export async function synthesizeSpeechWithOpenAiCompatible({ if (!isAudioContentType({ contentType })) { if (shouldTryResponsesWebSocket({ response })) { lastErrorResponse = response; + await cancelResponseBody({ response }); break; } + await cancelResponseBody({ response }); throw new TtsError({ code: "EXTERNAL_TTS_UPSTREAM", message: `Expected audio response, received ${contentType || "(no content-type)"}`, @@ -651,6 +665,8 @@ export async function synthesizeSpeechWithOpenAiCompatible({ if (response.status !== 404) { break; } + + await cancelResponseBody({ response }); } if ( From 5fb0c77530421b4db3edf6235898723df24a7624 Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 22:30:57 +0800 Subject: [PATCH 21/22] docs(tts): clarify external model requirements Summary: - recommend a concrete TTS-capable model in README and env example - document that shared API_MODEL aliases may point at non-TTS models Rationale: - fresh live probes showed the current alias model is not a clear TTS model - clearer config guidance reduces false negatives during runtime validation Tests: - git diff --check Co-authored-by: Codex --- README.md | 7 ++++++- apps/web/.env.example | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7d43b9b..46fdcd9 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Optional TTS env values: ```bash EXTERNAL_TTS_API_BASE_URL="https://your-tts-provider.example.com/v1" -EXTERNAL_TTS_API_MODEL="your_tts_model" +EXTERNAL_TTS_API_MODEL="gpt-4o-mini-tts" EXTERNAL_TTS_API_KEY="your_tts_api_key" ``` @@ -84,6 +84,11 @@ Cutia prefers `EXTERNAL_TTS_API_*` for external speech synthesis. The legacy `API_BASE_URL` / `API_MODEL` / `API_KEY` names are still accepted as compatibility aliases when the namespaced variables are absent. 
+Use a provider-supported TTS model for `EXTERNAL_TTS_API_MODEL` (for example +`gpt-4o-mini-tts` or another audio-output model that your provider actually +supports). The shared `API_MODEL` alias is only a migration fallback and may +already point at a non-TTS chat model in your environment. + To verify that the configured provider can actually return audio, run: ```bash diff --git a/apps/web/.env.example b/apps/web/.env.example index cd7f63d..00af487 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -19,8 +19,10 @@ FREESOUND_API_KEY=your_api_key_here # Optional: external OpenAI-compatible TTS provider # Preferred namespaced variables: # EXTERNAL_TTS_API_BASE_URL=https://your-tts-provider.example.com/v1 -# EXTERNAL_TTS_API_MODEL=your_tts_model +# EXTERNAL_TTS_API_MODEL=gpt-4o-mini-tts # EXTERNAL_TTS_API_KEY=your_tts_api_key +# Use a provider-supported audio/TTS model here. Shared API_MODEL values are +# often general chat models and may not work for speech generation. # Compatibility aliases used when EXTERNAL_TTS_* is absent: # API_BASE_URL=https://your-shared-api.example.com/v1 # API_MODEL=your_tts_model From 2e71fe384548767c6231036412c149a04894af10 Mon Sep 17 00:00:00 2001 From: tianhei Date: Fri, 20 Mar 2026 22:42:43 +0800 Subject: [PATCH 22/22] docs(tts): add provider capability troubleshooting Summary: - document the provider-side prerequisites for successful TTS probes - explain how /models and audio endpoints affect runtime validation Rationale: - recent investigation showed live probe failures were caused by provider capabilities and upstream availability, not local code regressions - keeping that guidance in repo docs reduces repeated misdiagnosis Tests: - git diff --check Co-authored-by: Codex --- README.md | 11 +++++++++++ apps/web/.env.example | 2 ++ 2 files changed, 13 insertions(+) diff --git a/README.md b/README.md index 46fdcd9..89fb315 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,17 @@ Use a provider-supported TTS model for 
`EXTERNAL_TTS_API_MODEL` (for example supports). The shared `API_MODEL` alias is only a migration fallback and may already point at a non-TTS chat model in your environment. +Before treating a failed probe as a code regression, confirm the provider +itself is TTS-capable for the current credentials: + +- `/models` should list the configured TTS model or another audio-capable model +- either `/audio/speech` must return audio directly, or `/responses` must accept + audio output requests for the configured model +- if `/audio/speech` returns `404` and `/models` contains only chat/text models, + the provider is not exposing a usable TTS surface for this environment +- legacy fallback is best-effort only; if the legacy upstream is unavailable, + route probes will still return `502` + To verify that the configured provider can actually return audio, run: ```bash diff --git a/apps/web/.env.example b/apps/web/.env.example index 00af487..0de5987 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -23,6 +23,8 @@ FREESOUND_API_KEY=your_api_key_here # EXTERNAL_TTS_API_KEY=your_tts_api_key # Use a provider-supported audio/TTS model here. Shared API_MODEL values are # often general chat models and may not work for speech generation. +# The provider should also expose that model from /models and support either +# /audio/speech or /responses audio output for the same credentials. # Compatibility aliases used when EXTERNAL_TTS_* is absent: # API_BASE_URL=https://your-shared-api.example.com/v1 # API_MODEL=your_tts_model