diff --git a/README.md b/README.md index e26ac14..89fb315 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,46 @@ UPSTASH_REDIS_REST_TOKEN="cutia_redis_token" NODE_ENV="development" ``` +Optional TTS env values: + +```bash +EXTERNAL_TTS_API_BASE_URL="https://your-tts-provider.example.com/v1" +EXTERNAL_TTS_API_MODEL="gpt-4o-mini-tts" +EXTERNAL_TTS_API_KEY="your_tts_api_key" +``` + +Cutia prefers `EXTERNAL_TTS_API_*` for external speech synthesis. The legacy +`API_BASE_URL` / `API_MODEL` / `API_KEY` names are still accepted as +compatibility aliases when the namespaced variables are absent. + +Use a provider-supported TTS model for `EXTERNAL_TTS_API_MODEL` (for example +`gpt-4o-mini-tts` or another audio-output model that your provider actually +supports). The shared `API_MODEL` alias is only a migration fallback and may +already point at a non-TTS chat model in your environment. + +Before treating a failed probe as a code regression, confirm the provider +itself is TTS-capable for the current credentials: + +- `/models` should list the configured TTS model or another audio-capable model +- either `/audio/speech` must return audio directly, or `/responses` must accept + audio output requests for the configured model +- if `/audio/speech` returns `404` and `/models` contains only chat/text models, + the provider is not exposing a usable TTS surface for this environment +- legacy fallback is best-effort only; if the legacy upstream is unavailable, + route probes will still return `502` + +To verify that the configured provider can actually return audio, run: + +```bash +bun --eval 'import { getExternalTtsConfig, synthesizeSpeechWithOpenAiCompatible } from "./src/lib/tts/openai-compatible.ts"; const config = getExternalTtsConfig({ env: process.env }); const audio = await synthesizeSpeechWithOpenAiCompatible({ config, text: "Cutia TTS probe", voice: "default" }); console.log(audio.byteLength);' +``` + +If you want to verify the route end-to-end from the app directory, 
run: + +```bash +NODE_ENV=development NEXT_PUBLIC_SITE_URL=http://localhost:3000 UPSTASH_REDIS_REST_URL=http://localhost:8079 UPSTASH_REDIS_REST_TOKEN=cutia_redis_token bun --eval 'import { NextRequest } from "next/server"; import { POST } from "./src/app/api/tts/generate/route.ts"; const request = new NextRequest("http://localhost/api/tts/generate", { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ text: "Cutia route probe", voice: "default" }) }); const response = await POST(request); console.log(response.status); console.log(await response.text());' +``` + To enable authentication, also start PostgreSQL and add these env values: ```bash diff --git a/apps/web/.env.example b/apps/web/.env.example index 85e483b..0de5987 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -16,9 +16,23 @@ UPSTASH_REDIS_REST_TOKEN=example_token_here FREESOUND_CLIENT_ID=your_client_id_here FREESOUND_API_KEY=your_api_key_here +# Optional: external OpenAI-compatible TTS provider +# Preferred namespaced variables: +# EXTERNAL_TTS_API_BASE_URL=https://your-tts-provider.example.com/v1 +# EXTERNAL_TTS_API_MODEL=gpt-4o-mini-tts +# EXTERNAL_TTS_API_KEY=your_tts_api_key +# Use a provider-supported audio/TTS model here. Shared API_MODEL values are +# often general chat models and may not work for speech generation. +# The provider should also expose that model from /models and support either +# /audio/speech or /responses audio output for the same credentials. 
+# Compatibility aliases used when EXTERNAL_TTS_* is absent: +# API_BASE_URL=https://your-shared-api.example.com/v1 +# API_MODEL=your_tts_model +# API_KEY=your_tts_api_key + # Cloudflare R2 (for reference image uploads) R2_ACCOUNT_ID=your_r2_account_id R2_ACCESS_KEY_ID=your_r2_access_key_id R2_SECRET_ACCESS_KEY=your_r2_secret_access_key R2_BUCKET_NAME=your_r2_bucket_name -R2_PUBLIC_URL=https://your-r2-public-url.example.com \ No newline at end of file +R2_PUBLIC_URL=https://your-r2-public-url.example.com diff --git a/apps/web/src/app/api/tts/generate/route.test.ts b/apps/web/src/app/api/tts/generate/route.test.ts new file mode 100644 index 0000000..6250a9f --- /dev/null +++ b/apps/web/src/app/api/tts/generate/route.test.ts @@ -0,0 +1,113 @@ +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import { TtsError } from "@/lib/tts/errors"; +import { NextRequest } from "next/server"; + +let synthesizeImpl: typeof import("@/lib/tts/provider").synthesizeSpeechWithFallback; +const originalConsoleError = console.error; + +mock.module("@cutia/env/web", () => ({ + webEnv: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, +})); + +mock.module("@/lib/tts/provider", () => ({ + synthesizeSpeechWithFallback: (args: Parameters[0]) => + synthesizeImpl(args), +})); + +const { POST } = await import("./route"); + +function createRequest(body: unknown): NextRequest { + return new NextRequest("http://localhost/api/tts/generate", { + body: JSON.stringify(body), + headers: { + "content-type": "application/json", + }, + method: "POST", + }); +} + +describe("POST /api/tts/generate", () => { + beforeEach(() => { + console.error = mock(() => {}); + synthesizeImpl = async () => Uint8Array.from([1, 2, 3]).buffer; + }); + + afterEach(() => { + console.error = originalConsoleError; + }); + + test("returns base64 audio for successful synthesis", async () => { + const response = await POST(createRequest({ text: "hello" })); + + 
expect(response.status).toBe(200); + expect(await response.json()).toEqual({ + audio: "AQID", + }); + }); + + test("returns 502 for structured legacy upstream errors without relying on message prefixes", async () => { + synthesizeImpl = async () => { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "legacy fallback audio download failed", + }); + }; + + const response = await POST(createRequest({ text: "hello" })); + + expect(response.status).toBe(502); + expect(await response.json()).toEqual({ + error: "legacy fallback audio download failed", + }); + }); + + test("returns 502 for structured external upstream errors without relying on message prefixes", async () => { + synthesizeImpl = async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "upstream gateway timeout", + }); + }; + + const response = await POST(createRequest({ text: "hello" })); + + expect(response.status).toBe(502); + expect(await response.json()).toEqual({ + error: "upstream gateway timeout", + }); + }); + + test("returns the original config error message for structured config failures", async () => { + synthesizeImpl = async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_CONFIG", + message: "external config missing", + }); + }; + + const response = await POST(createRequest({ text: "hello" })); + + expect(response.status).toBe(500); + expect(await response.json()).toEqual({ + error: "external config missing", + }); + }); + + test("returns 500 for unexpected non-TtsError exceptions", async () => { + synthesizeImpl = async () => { + throw new Error("unexpected failure"); + }; + + const response = await POST(createRequest({ text: "hello" })); + + expect(response.status).toBe(500); + expect(await response.json()).toEqual({ + error: "Internal server error", + detail: "unexpected failure", + }); + }); +}); diff --git a/apps/web/src/app/api/tts/generate/route.ts b/apps/web/src/app/api/tts/generate/route.ts index 6767f75..82ffde3 100644 --- 
a/apps/web/src/app/api/tts/generate/route.ts +++ b/apps/web/src/app/api/tts/generate/route.ts @@ -1,18 +1,14 @@ +import { webEnv } from "@cutia/env/web"; import { type NextRequest, NextResponse } from "next/server"; import { z } from "zod"; - -const TTS_API_BASE = "https://api.milorapart.top/apis/mbAIsc"; +import { isTtsError } from "@/lib/tts/errors"; +import { synthesizeSpeechWithFallback } from "@/lib/tts/provider"; const requestSchema = z.object({ text: z.string().min(1, "Text is required").max(2000, "Text too long"), voice: z.string().optional(), }); -const upstreamResponseSchema = z.object({ - code: z.number(), - url: z.string().url(), -}); - export async function POST(request: NextRequest) { try { const body = await request.json(); @@ -28,42 +24,33 @@ export async function POST(request: NextRequest) { ); } - const { text } = validation.data; - const upstreamUrl = `${TTS_API_BASE}?${new URLSearchParams({ text, format: "mp3" })}`; - const upstreamResponse = await fetch(upstreamUrl); - - if (!upstreamResponse.ok) { - return NextResponse.json( - { error: `Upstream error: ${upstreamResponse.status}` }, - { status: 502 }, - ); - } - - const upstreamData = await upstreamResponse.json(); - const parsed = upstreamResponseSchema.safeParse(upstreamData); - - if (!parsed.success || parsed.data.code !== 200) { - return NextResponse.json( - { error: "TTS generation failed" }, - { status: 502 }, - ); - } - - const audioResponse = await fetch(parsed.data.url); - if (!audioResponse.ok) { - return NextResponse.json( - { error: `Failed to download audio: ${audioResponse.status}` }, - { status: 502 }, - ); - } - - const audioArrayBuffer = await audioResponse.arrayBuffer(); + const { text, voice } = validation.data; + const audioArrayBuffer = await synthesizeSpeechWithFallback({ + env: webEnv, + text, + voice, + }); const base64 = Buffer.from(audioArrayBuffer).toString("base64"); return NextResponse.json({ audio: base64 }); } catch (error) { const message = error instanceof 
Error ? error.message : "Unknown error"; console.error("TTS generate error:", error); + + if (isTtsError(error)) { + switch (error.code) { + case "EXTERNAL_TTS_CONFIG": + return NextResponse.json({ error: message }, { status: 500 }); + case "EXTERNAL_TTS_UPSTREAM": + case "LEGACY_TTS_UPSTREAM": + return NextResponse.json({ error: message }, { status: 502 }); + default: { + const exhaustiveCode: never = error.code; + throw new Error(`Unhandled TTS error code: ${exhaustiveCode}`); + } + } + } + return NextResponse.json( { error: "Internal server error", detail: message }, { status: 500 }, diff --git a/apps/web/src/constants/tts-constants.ts b/apps/web/src/constants/tts-constants.ts index 60c4084..5b27045 100644 --- a/apps/web/src/constants/tts-constants.ts +++ b/apps/web/src/constants/tts-constants.ts @@ -3,8 +3,7 @@ export interface VoicePack { name: string; } -export const VOICE_PACKS: VoicePack[] = [ - { id: "default", name: "Default" }, -]; +export const VOICE_PACKS: VoicePack[] = [{ id: "default", name: "Default" }]; export const DEFAULT_VOICE_PACK = "default"; +export const DEFAULT_EXTERNAL_TTS_VOICE = "alloy"; diff --git a/apps/web/src/lib/tts/errors.ts b/apps/web/src/lib/tts/errors.ts new file mode 100644 index 0000000..639b9ee --- /dev/null +++ b/apps/web/src/lib/tts/errors.ts @@ -0,0 +1,39 @@ +export const TTS_ERROR_CODES = [ + "EXTERNAL_TTS_CONFIG", + "EXTERNAL_TTS_UPSTREAM", + "LEGACY_TTS_UPSTREAM", +] as const; + +export type TtsErrorCode = (typeof TTS_ERROR_CODES)[number]; + +export class TtsError extends Error { + code: TtsErrorCode; + retryable?: boolean; + status?: number; + + constructor({ + code, + message, + retryable, + status, + }: { + code: TtsErrorCode; + message: string; + retryable?: boolean; + status?: number; + }) { + super(message); + this.name = "TtsError"; + this.code = code; + this.retryable = retryable; + this.status = status; + } +} + +export function isTtsError(error: unknown): error is TtsError { + if (!(error instanceof Error)) { 
+ return false; + } + + return TTS_ERROR_CODES.includes((error as TtsError).code); +} diff --git a/apps/web/src/lib/tts/fetch-with-timeout.test.ts b/apps/web/src/lib/tts/fetch-with-timeout.test.ts new file mode 100644 index 0000000..8652a1f --- /dev/null +++ b/apps/web/src/lib/tts/fetch-with-timeout.test.ts @@ -0,0 +1,87 @@ +import { describe, expect, test } from "bun:test"; +import { fetchWithTimeout } from "./fetch-with-timeout"; + +describe("fetchWithTimeout", () => { + test("resolves successfully when fetch completes before the timeout", async () => { + let fetchCalled = false; + + const response = await fetchWithTimeout({ + fetchImpl: async () => { + fetchCalled = true; + return new Response("ok", { status: 200 }); + }, + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 50, + }); + + expect(fetchCalled).toBe(true); + expect(response.status).toBe(200); + expect(await response.text()).toBe("ok"); + }); + + test("rejects immediately when the caller signal is already aborted", async () => { + const controller = new AbortController(); + const callerError = new Error("caller aborted"); + let fetchCalled = false; + + controller.abort(callerError); + + await expect( + fetchWithTimeout({ + fetchImpl: async () => { + fetchCalled = true; + return new Response("ok"); + }, + init: { signal: controller.signal }, + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 50, + }), + ).rejects.toThrow("caller aborted"); + + expect(fetchCalled).toBe(false); + }); + + test("surfaces caller cancellation for in-flight requests", async () => { + const controller = new AbortController(); + const callerError = new Error("caller aborted"); + + await expect( + fetchWithTimeout({ + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + setTimeout(() => controller.abort(callerError), 0); + + init?.signal?.addEventListener( + "abort", + () => reject(init.signal?.reason ?? 
new Error("aborted")), + { once: true }, + ); + }), + init: { signal: controller.signal }, + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 50, + }), + ).rejects.toThrow("caller aborted"); + }); + + test("rejects with the timeout message when fetch exceeds timeoutMs", async () => { + await expect( + fetchWithTimeout({ + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }), + input: "https://example.com", + timeoutMessage: "timed out", + timeoutMs: 10, + }), + ).rejects.toThrow("timed out"); + }); +}); diff --git a/apps/web/src/lib/tts/fetch-with-timeout.ts b/apps/web/src/lib/tts/fetch-with-timeout.ts new file mode 100644 index 0000000..47ab95b --- /dev/null +++ b/apps/web/src/lib/tts/fetch-with-timeout.ts @@ -0,0 +1,62 @@ +export type FetchLike = ( + input: RequestInfo | URL, + init?: RequestInit, +) => Promise; + +function throwCallerAbortReason(signal: AbortSignal): never { + if (signal.reason instanceof Error) { + throw signal.reason; + } + + throw new Error(String(signal.reason ?? 
"Request aborted")); +} + +export async function fetchWithTimeout({ + fetchImpl, + input, + init, + timeoutMs, + timeoutMessage, +}: { + fetchImpl: FetchLike; + input: RequestInfo | URL; + init?: RequestInit; + timeoutMs: number; + timeoutMessage: string; +}): Promise { + const controller = new AbortController(); + const callerSignal = init?.signal; + let didTimeout = false; + const abortFromCaller = () => controller.abort(callerSignal?.reason); + + if (callerSignal?.aborted) { + throwCallerAbortReason(callerSignal); + } + + callerSignal?.addEventListener("abort", abortFromCaller, { once: true }); + + const timeoutId = setTimeout(() => { + didTimeout = true; + controller.abort(new Error(timeoutMessage)); + }, timeoutMs); + + try { + return await fetchImpl(input, { + ...init, + signal: controller.signal, + }); + } catch (error) { + if (didTimeout) { + throw new Error(timeoutMessage); + } + + if (callerSignal?.aborted) { + throwCallerAbortReason(callerSignal); + } + + throw error; + } finally { + clearTimeout(timeoutId); + callerSignal?.removeEventListener("abort", abortFromCaller); + } +} diff --git a/apps/web/src/lib/tts/legacy.test.ts b/apps/web/src/lib/tts/legacy.test.ts new file mode 100644 index 0000000..c3e14ac --- /dev/null +++ b/apps/web/src/lib/tts/legacy.test.ts @@ -0,0 +1,220 @@ +import { describe, expect, test } from "bun:test"; +import { synthesizeSpeechWithLegacyProvider } from "./legacy"; + +describe("synthesizeSpeechWithLegacyProvider", () => { + const TEST_TIMEOUT_MS = 50; + const LEGACY_AUDIO_URL = "https://api.milorapart.top/voice/test.mp3"; + const LEGACY_METADATA_ROUTE = "/apis/mbAIsc?"; + + function legacyMetadataOk(url = LEGACY_AUDIO_URL): Response { + return Response.json({ + code: 200, + url, + }); + } + + function isLegacyMetadataRequest(input: RequestInfo | URL): boolean { + return String(input).includes(LEGACY_METADATA_ROUTE); + } + + test("rejects audio urls outside the expected https host allowlist", async () => { + const calls: 
string[] = []; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + calls.push(String(input)); + return legacyMetadataOk("http://127.0.0.1/internal.mp3"); + }, + }), + ).rejects.toThrow("Legacy TTS returned an unexpected audio URL"); + + expect(calls).toHaveLength(1); + }); + + test("rejects non-audio content returned by the legacy audio download", async () => { + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + if (isLegacyMetadataRequest(input)) { + return legacyMetadataOk(); + } + + return new Response("", { + status: 200, + headers: { "Content-Type": "text/html; charset=utf-8" }, + }); + }, + }), + ).rejects.toThrow("Legacy TTS returned non-audio content"); + }); + + test("rejects audio downloads when the content-type header is missing", async () => { + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + if (isLegacyMetadataRequest(input)) { + return legacyMetadataOk(); + } + + return new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + }); + }, + }), + ).rejects.toThrow("Legacy TTS returned non-audio content"); + }); + + test("accepts audio downloads when the MIME type casing and parameters vary", async () => { + const audio = await synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input) => { + if (isLegacyMetadataRequest(input)) { + return legacyMetadataOk(); + } + + return new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + headers: { "Content-Type": "Audio/MPEG; Charset=utf-8" }, + }); + }, + }); + + expect(Array.from(new Uint8Array(audio))).toEqual([1, 2, 3]); + }); + + test("rejects redirected audio downloads that leave the allowlist", async () => { + let sawManualRedirect = false; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input, init) => { + if (isLegacyMetadataRequest(input)) { + return 
legacyMetadataOk(); + } + + sawManualRedirect = init?.redirect === "manual"; + + return new Response(null, { + status: 302, + headers: { + location: "https://evil.example.com/payload.mp3", + }, + }); + }, + }), + ).rejects.toMatchObject({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download redirected to an unexpected host", + }); + + expect(sawManualRedirect).toBe(true); + }); + + test("follows allowlisted redirects for legacy audio downloads", async () => { + let downloadCallCount = 0; + + const audio = await synthesizeSpeechWithLegacyProvider({ + text: "hello", + fetchImpl: async (input, init) => { + if (isLegacyMetadataRequest(input)) { + return legacyMetadataOk(); + } + + downloadCallCount++; + + if (downloadCallCount === 1) { + expect(init?.redirect).toBe("manual"); + + return new Response(null, { + status: 302, + headers: { + location: "https://api.milorapart.top/voice/test-redirected.mp3", + }, + }); + } + + expect(init?.redirect).toBe("error"); + expect(String(input)).toBe( + "https://api.milorapart.top/voice/test-redirected.mp3", + ); + return new Response(Uint8Array.from([4, 5, 6]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + }, + }); + + expect(downloadCallCount).toBe(2); + expect(Array.from(new Uint8Array(audio))).toEqual([4, 5, 6]); + }); + + test("rejects synthesis text that would exceed the legacy GET limit", async () => { + let fetchCalled = false; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "中".repeat(400), + fetchImpl: async () => { + fetchCalled = true; + return legacyMetadataOk(); + }, + }), + ).rejects.toThrow("Legacy TTS text is too long for GET fallback"); + + expect(fetchCalled).toBe(false); + }); + + test("aborts the metadata request when the upstream hangs", async () => { + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + timeoutMs: TEST_TIMEOUT_MS, + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + 
init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }), + }), + ).rejects.toMatchObject({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS request timed out", + }); + }); + + test("aborts the audio download when the legacy audio fetch hangs", async () => { + let callCount = 0; + + await expect( + synthesizeSpeechWithLegacyProvider({ + text: "hello", + timeoutMs: TEST_TIMEOUT_MS, + fetchImpl: async (_input, init) => { + callCount++; + + if (callCount === 1) { + return legacyMetadataOk(); + } + + return new Promise((_resolve, reject) => { + init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }); + }, + }), + ).rejects.toMatchObject({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download timed out", + }); + }); +}); diff --git a/apps/web/src/lib/tts/legacy.ts b/apps/web/src/lib/tts/legacy.ts new file mode 100644 index 0000000..f0e8212 --- /dev/null +++ b/apps/web/src/lib/tts/legacy.ts @@ -0,0 +1,187 @@ +import { z } from "zod"; +import { fetchWithTimeout, type FetchLike } from "./fetch-with-timeout"; +import { TtsError } from "./errors"; + +const LEGACY_TTS_API_BASE = "https://api.milorapart.top/apis/mbAIsc"; +const LEGACY_TTS_ALLOWED_AUDIO_HOSTS = new Set(["api.milorapart.top"]); +const LEGACY_TTS_TIMEOUT_MS = 15_000; +const LEGACY_TTS_MAX_URL_LENGTH = 1_800; + +const legacyResponseSchema = z.object({ + code: z.number(), + url: z.string().url(), +}); + +function isRedirectStatus(status: number): boolean { + return status >= 300 && status < 400; +} + +function wrapLegacyUpstreamError({ error }: { error: unknown }): TtsError { + if (error instanceof TtsError) { + return error; + } + + return new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: + error instanceof Error ? 
error.message : "Legacy TTS generation failed", + }); +} + +export async function synthesizeSpeechWithLegacyProvider({ + text, + voice: _voice, + fetchImpl = fetch, + timeoutMs = LEGACY_TTS_TIMEOUT_MS, +}: { + text: string; + voice?: string; + fetchImpl?: FetchLike; + timeoutMs?: number; +}): Promise { + void _voice; // Legacy upstream has a fixed voice; keep the arg for parity. + + const query = new URLSearchParams({ + format: "mp3", + text, + }).toString(); + const upstreamUrl = `${LEGACY_TTS_API_BASE}?${query}`; + + if (upstreamUrl.length > LEGACY_TTS_MAX_URL_LENGTH) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS text is too long for GET fallback", + }); + } + + let upstreamResponse: Response; + + try { + upstreamResponse = await fetchWithTimeout({ + fetchImpl, + input: upstreamUrl, + timeoutMessage: "Legacy TTS request timed out", + timeoutMs, + }); + } catch (error) { + throw wrapLegacyUpstreamError({ error }); + } + + if (!upstreamResponse.ok) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS request failed: ${upstreamResponse.status}`, + }); + } + + const upstreamJson = await upstreamResponse.json().catch(() => null); + const parsed = legacyResponseSchema.safeParse(upstreamJson); + + if (!parsed.success || parsed.data.code !== 200) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS generation failed", + }); + } + + const audioUrl = new URL(parsed.data.url); + + if ( + audioUrl.protocol !== "https:" || + !LEGACY_TTS_ALLOWED_AUDIO_HOSTS.has(audioUrl.hostname) + ) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS returned an unexpected audio URL", + }); + } + + let audioResponse: Response; + + try { + audioResponse = await fetchWithTimeout({ + fetchImpl, + init: { redirect: "manual" }, + input: audioUrl, + timeoutMessage: "Legacy TTS audio download timed out", + timeoutMs, + }); + } catch (error) { + throw wrapLegacyUpstreamError({ error }); + } 
+ + if (isRedirectStatus(audioResponse.status)) { + const location = audioResponse.headers.get("location"); + + if (!location) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS audio download failed: ${audioResponse.status}`, + }); + } + + let redirectUrl: URL; + + try { + redirectUrl = new URL(location, audioUrl); + } catch { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download redirected to an invalid URL", + }); + } + + if ( + redirectUrl.protocol !== "https:" || + !LEGACY_TTS_ALLOWED_AUDIO_HOSTS.has(redirectUrl.hostname) + ) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS audio download redirected to an unexpected host", + }); + } + + try { + audioResponse = await fetchWithTimeout({ + fetchImpl, + init: { redirect: "error" }, + input: redirectUrl, + timeoutMessage: "Legacy TTS audio download timed out", + timeoutMs, + }); + } catch (error) { + throw wrapLegacyUpstreamError({ error }); + } + } + + if (!audioResponse.ok) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS audio download failed: ${audioResponse.status}`, + }); + } + + const contentType = audioResponse.headers.get("content-type") ?? ""; + const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? 
""; + + if ( + !mimeType.startsWith("audio/") && + mimeType !== "application/octet-stream" + ) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: `Legacy TTS returned non-audio content: ${contentType || "(no content-type)"}`, + }); + } + + const audio = await audioResponse.arrayBuffer(); + + if (audio.byteLength === 0) { + throw new TtsError({ + code: "LEGACY_TTS_UPSTREAM", + message: "Legacy TTS returned empty audio", + }); + } + + return audio; +} diff --git a/apps/web/src/lib/tts/openai-compatible.test.ts b/apps/web/src/lib/tts/openai-compatible.test.ts new file mode 100644 index 0000000..c7608fd --- /dev/null +++ b/apps/web/src/lib/tts/openai-compatible.test.ts @@ -0,0 +1,630 @@ +import { describe, expect, test } from "bun:test"; +import { + DEFAULT_EXTERNAL_TTS_VOICE, + getExternalTtsConfig, + synthesizeSpeechWithOpenAiCompatible, +} from "./openai-compatible"; + +type WebSocketListenerMap = { + close: Array<(event: { code: number; reason: string }) => void>; + error: Array<(event: { message?: string; type?: string }) => void>; + message: Array<(event: { data: unknown }) => void>; + open: Array<() => void>; +}; + +class FakeWebSocket { + public readonly sentMessages: string[] = []; + private readonly listeners: WebSocketListenerMap = { + close: [], + error: [], + message: [], + open: [], + }; + + constructor( + public readonly url: string, + public readonly init?: { headers?: Record }, + ) {} + + addEventListener( + type: "close", + listener: WebSocketListenerMap["close"][number], + ): void; + addEventListener( + type: "error", + listener: WebSocketListenerMap["error"][number], + ): void; + addEventListener( + type: "message", + listener: WebSocketListenerMap["message"][number], + ): void; + addEventListener( + type: "open", + listener: WebSocketListenerMap["open"][number], + ): void; + addEventListener( + type: keyof WebSocketListenerMap, + listener: WebSocketListenerMap[keyof WebSocketListenerMap][number], + ) { + ( + this.listeners[type] as 
Array< + ( + event?: + | { code: number; reason: string } + | { message?: string; type?: string } + | { data: unknown }, + ) => void + > + ).push(listener as (event?: unknown) => void); + } + + close(code = 1000, reason = "") { + this.emit("close", { code, reason }); + } + + emit(type: "close", event: { code: number; reason: string }): void; + emit(type: "error", event: { message?: string; type?: string }): void; + emit(type: "message", event: { data: unknown }): void; + emit(type: "open"): void; + emit(type: keyof WebSocketListenerMap, event?: unknown) { + for (const listener of this.listeners[type] as Array< + (event?: unknown) => void + >) { + listener(event); + } + } + + send(message: string) { + this.sentMessages.push(message); + } +} + +describe("getExternalTtsConfig", () => { + test("reads namespaced TTS config from environment", () => { + const config = getExternalTtsConfig({ + env: { + EXTERNAL_TTS_API_BASE_URL: "https://example.com/v1/", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", + }, + }); + + expect(config).toEqual({ + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }); + }); + + test("falls back to legacy API_* aliases when namespaced TTS config is absent", () => { + const config = getExternalTtsConfig({ + env: { + API_BASE_URL: "https://example.com/v1/", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + }); + + expect(config).toEqual({ + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }); + }); + + test("prefers namespaced TTS config over legacy aliases", () => { + const config = getExternalTtsConfig({ + env: { + API_BASE_URL: "https://legacy.example.com/v1/", + API_MODEL: "legacy-tts", + API_KEY: "legacy-secret", + EXTERNAL_TTS_API_BASE_URL: "https://example.com/v1/", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", + }, + }); + + expect(config).toEqual({ + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }); + }); + 
+ test("throws a clear error when config is incomplete", () => { + expect(() => + getExternalTtsConfig({ + env: { + API_BASE_URL: "https://example.com/v1", + API_KEY: "secret", + }, + }), + ).toThrow("External TTS is not configured"); + }); + + test("rejects whitespace-only config values", () => { + expect(() => + getExternalTtsConfig({ + env: { + API_BASE_URL: " ", + API_MODEL: " ", + API_KEY: " ", + }, + }), + ).toThrow("External TTS is not configured"); + }); + + test("rejects malformed API_BASE_URL values", () => { + expect(() => + getExternalTtsConfig({ + env: { + API_BASE_URL: "not-a-url", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + }), + ).toThrow("External TTS is not configured"); + }); + + test("rejects non-http API_BASE_URL schemes", () => { + expect(() => + getExternalTtsConfig({ + env: { + EXTERNAL_TTS_API_BASE_URL: "mailto:tts@example.com", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", + }, + }), + ).toThrow("External TTS is not configured"); + }); +}); + +describe("synthesizeSpeechWithOpenAiCompatible", () => { + test("posts audio speech requests with the mapped default voice", async () => { + const calls: Array<{ input: RequestInfo | URL; init?: RequestInit }> = []; + const audioBytes = Uint8Array.from([1, 2, 3, 4]); + + const result = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1/", + apiKey: "secret", + model: "tts-1", + }, + text: "你好,Cutia", + voice: "default", + fetchImpl: async (input, init) => { + calls.push({ input, init }); + return new Response(audioBytes, { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([1, 2, 3, 4]); + expect(calls).toHaveLength(1); + expect(calls[0]?.input).toBe("https://example.com/v1/audio/speech"); + + const headers = new Headers(calls[0]?.init?.headers); + expect(headers.get("authorization")).toBe("Bearer secret"); + 
expect(headers.get("content-type")).toBe("application/json"); + + expect(JSON.parse(String(calls[0]?.init?.body))).toEqual({ + input: "你好,Cutia", + model: "tts-1", + response_format: "mp3", + voice: DEFAULT_EXTERNAL_TTS_VOICE, + }); + }); + + test("surfaces upstream JSON error messages", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "nova", + fetchImpl: async () => + Response.json( + { error: { message: "quota exceeded" } }, + { status: 429 }, + ), + }), + ).rejects.toThrow("quota exceeded"); + }); + + test("falls back to the root audio speech path when the v1 path returns 404", async () => { + const calls: string[] = []; + const cancelledResponses: string[] = []; + + const audio = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async (input) => { + const url = String(input); + calls.push(url); + + if (url === "https://example.com/v1/audio/speech") { + return { + body: { + cancel: async () => { + cancelledResponses.push(url); + }, + }, + headers: new Headers(), + ok: false, + status: 404, + } as Response; + } + + return new Response(Uint8Array.from([9, 8, 7]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + }, + }); + + expect(Array.from(new Uint8Array(audio))).toEqual([9, 8, 7]); + expect(calls).toEqual([ + "https://example.com/v1/audio/speech", + "https://example.com/audio/speech", + ]); + expect(cancelledResponses).toEqual(["https://example.com/v1/audio/speech"]); + }); + + test("tries the /v1 speech endpoint first when the base url is root-level", async () => { + const calls: string[] = []; + + const audio = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com", + apiKey: "secret", + model: "tts-1", + }, + text: 
"hello", + voice: "default", + fetchImpl: async (input) => { + const url = String(input); + calls.push(url); + + if (url === "https://example.com/v1/audio/speech") { + return new Response(Uint8Array.from([5, 4, 3]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + } + + return new Response("not found", { status: 404 }); + }, + }); + + expect(Array.from(new Uint8Array(audio))).toEqual([5, 4, 3]); + expect(calls[0]).toBe("https://example.com/v1/audio/speech"); + }); + + test("rejects non-audio success responses", async () => { + let cancelCalled = false; + + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + ({ + body: { + cancel: async () => { + cancelCalled = true; + }, + }, + headers: new Headers({ + "Content-Type": "text/plain; charset=utf-8", + }), + ok: true, + status: 200, + }) as Response, + }), + ).rejects.toThrow("Expected audio response"); + + expect(cancelCalled).toBe(true); + }); + + test("rejects success responses when the content-type header is missing", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + }), + }), + ).rejects.toThrow("Expected audio response"); + }); + + test("accepts audio responses when MIME type casing and parameters vary", async () => { + const audio = await synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + headers: { "Content-Type": "Audio/MPEG; Charset=utf-8" }, + }), + }); + + expect(Array.from(new 
Uint8Array(audio))).toEqual([1, 2, 3]); + }); + + test("wraps arrayBuffer read failures as non-retryable upstream errors", async () => { + const response = new Response(Uint8Array.from([1, 2, 3]), { + status: 200, + headers: { "Content-Type": "audio/mpeg" }, + }); + Object.defineProperty(response, "arrayBuffer", { + value: async () => { + throw new Error("stream failed"); + }, + }); + + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => response, + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS audio read failed: stream failed", + retryable: false, + status: 200, + }); + }); + + test("aborts upstream requests that exceed the timeout", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + timeoutMs: 10, + fetchImpl: async (_input, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener( + "abort", + () => reject(new Error("aborted")), + { once: true }, + ); + }), + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS request timed out", + }); + }); + + test("surfaces upstream text errors when JSON is unavailable", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "nova", + fetchImpl: async () => + new Response("gateway timeout", { + status: 504, + headers: { "Content-Type": "text/plain" }, + }), + }), + ).rejects.toThrow("gateway timeout"); + }); + + test("marks auth failures as non-retryable upstream errors", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: 
"https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + fetchImpl: async () => + Response.json( + { error: { message: "invalid api key" } }, + { status: 401, statusText: "Unauthorized" }, + ), + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS request failed: invalid api key", + retryable: false, + status: 401, + }); + }); + + test("falls back to the raw upstream body when JSON shape is unrecognized", async () => { + await expect( + synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "nova", + fetchImpl: async () => + new Response('{"message":"bad request"}', { + status: 400, + headers: { "Content-Type": "application/json" }, + }), + }), + ).rejects.toThrow('{"message":"bad request"}'); + }); + + test("falls back to /responses websocket audio when /audio/speech returns 404", async () => { + const sockets: FakeWebSocket[] = []; + const synthesis = synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + createWebSocket: (url, init) => { + const socket = new FakeWebSocket(url, init); + sockets.push(socket); + return socket; + }, + fetchImpl: async () => new Response("page not found", { status: 404 }), + }); + await new Promise((resolve) => setTimeout(resolve, 0)); + + expect(sockets).toHaveLength(1); + expect(sockets[0]?.url).toBe("wss://example.com/v1/responses"); + expect(sockets[0]?.init?.headers?.Authorization).toBe("Bearer secret"); + + sockets[0]?.emit("open"); + expect(JSON.parse(sockets[0]?.sentMessages[0] ?? 
"")).toEqual({ + audio: { format: "mp3" }, + input: "hello", + model: "tts-1", + output_modalities: ["audio"], + response: { + instructions: "hello", + modalities: ["audio"], + output_audio_format: "mp3", + voice: DEFAULT_EXTERNAL_TTS_VOICE, + }, + type: "response.create", + }); + sockets[0]?.emit("message", { + data: JSON.stringify({ + type: "response.audio.delta", + delta: Buffer.from(Uint8Array.from([7, 8, 9])).toString("base64"), + }), + }); + sockets[0]?.emit("message", { + data: JSON.stringify({ type: "response.completed" }), + }); + + expect(Array.from(new Uint8Array(await synthesis))).toEqual([7, 8, 9]); + }); + + test("falls back to /responses websocket audio when /audio/speech returns html", async () => { + const sockets: FakeWebSocket[] = []; + let cancelCalled = false; + const synthesis = synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "echo", + createWebSocket: (url, init) => { + const socket = new FakeWebSocket(url, init); + sockets.push(socket); + return socket; + }, + fetchImpl: async () => + ({ + body: { + cancel: async () => { + cancelCalled = true; + }, + }, + headers: new Headers({ + "Content-Type": "text/html; charset=utf-8", + }), + ok: true, + status: 200, + }) as Response, + }); + await new Promise((resolve) => setTimeout(resolve, 0)); + + expect(cancelCalled).toBe(true); + sockets[0]?.emit("open"); + sockets[0]?.emit("message", { + data: JSON.stringify({ + type: "response.output_audio.delta", + delta: Buffer.from(Uint8Array.from([1, 2, 3, 4])).toString("base64"), + }), + }); + sockets[0]?.emit("message", { + data: JSON.stringify({ type: "response.done" }), + }); + + expect(Array.from(new Uint8Array(await synthesis))).toEqual([1, 2, 3, 4]); + expect(JSON.parse(sockets[0]?.sentMessages[0] ?? 
"").response.voice).toBe( + "echo", + ); + }); + + test("marks websocket account exhaustion as retryable so legacy fallback can recover", async () => { + const sockets: FakeWebSocket[] = []; + const synthesis = synthesizeSpeechWithOpenAiCompatible({ + config: { + apiBaseUrl: "https://example.com/v1", + apiKey: "secret", + model: "tts-1", + }, + text: "hello", + voice: "default", + createWebSocket: (url, init) => { + const socket = new FakeWebSocket(url, init); + sockets.push(socket); + return socket; + }, + fetchImpl: async () => new Response("page not found", { status: 404 }), + }); + await new Promise((resolve) => setTimeout(resolve, 0)); + + sockets[0]?.emit("open"); + sockets[0]?.emit("close", { + code: 1013, + reason: "no available account", + }); + + await expect(synthesis).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS websocket request failed: no available account", + retryable: true, + }); + }); +}); diff --git a/apps/web/src/lib/tts/openai-compatible.ts b/apps/web/src/lib/tts/openai-compatible.ts new file mode 100644 index 0000000..270c03f --- /dev/null +++ b/apps/web/src/lib/tts/openai-compatible.ts @@ -0,0 +1,698 @@ +import { z } from "zod"; +import { + DEFAULT_EXTERNAL_TTS_VOICE, + DEFAULT_VOICE_PACK, +} from "@/constants/tts-constants"; +import { fetchWithTimeout, type FetchLike } from "./fetch-with-timeout"; +import { TtsError } from "./errors"; + +const externalTtsConfigSchema = z.object({ + API_BASE_URL: z.string().min(1), + API_MODEL: z.string().min(1), + API_KEY: z.string().min(1), +}); +const EXTERNAL_TTS_TIMEOUT_MS = 15_000; +const EXTERNAL_TTS_RESPONSES_AUDIO_FORMAT = "mp3"; + +export { DEFAULT_EXTERNAL_TTS_VOICE }; + +export interface ExternalTtsConfig { + apiBaseUrl: string; + apiKey: string; + model: string; +} + +interface ExternalTtsWebSocketMessageEvent { + data: unknown; +} + +interface ExternalTtsWebSocketErrorEvent { + message?: string; + type?: string; +} + +interface ExternalTtsWebSocketCloseEvent 
{ + code: number; + reason: string; +} + +export interface ExternalTtsWebSocketLike { + addEventListener( + type: "close", + listener: (event: ExternalTtsWebSocketCloseEvent) => void, + ): void; + addEventListener( + type: "error", + listener: (event: ExternalTtsWebSocketErrorEvent) => void, + ): void; + addEventListener( + type: "message", + listener: (event: ExternalTtsWebSocketMessageEvent) => void, + ): void; + addEventListener(type: "open", listener: () => void): void; + close(code?: number, reason?: string): void; + removeEventListener?( + type: "close", + listener: (event: ExternalTtsWebSocketCloseEvent) => void, + ): void; + removeEventListener?( + type: "error", + listener: (event: ExternalTtsWebSocketErrorEvent) => void, + ): void; + removeEventListener?( + type: "message", + listener: (event: ExternalTtsWebSocketMessageEvent) => void, + ): void; + removeEventListener?(type: "open", listener: () => void): void; + send(data: string): void; +} + +export type ExternalTtsWebSocketFactory = ( + url: string, + init?: { + headers?: Record<string, string>; + }, +) => ExternalTtsWebSocketLike; + +function resolveExternalTtsEnv({ + env, +}: { + env: Record<string, string | undefined>; +}): Record<"API_BASE_URL" | "API_MODEL" | "API_KEY", string | undefined> { + return { + API_BASE_URL: env.EXTERNAL_TTS_API_BASE_URL ?? env.API_BASE_URL, + API_MODEL: env.EXTERNAL_TTS_API_MODEL ?? env.API_MODEL, + API_KEY: env.EXTERNAL_TTS_API_KEY ?? env.API_KEY, + }; +} + +function isRetryableStatus(status: number | undefined): boolean { + if (status == null) { + return true; + } + + return status === 408 || status === 429 || status >= 500; +} + +function wrapExternalUpstreamError({ error }: { error: unknown }): TtsError { + if (error instanceof TtsError) { + return error; + } + + return new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + error instanceof Error ? 
error.message : "External TTS request failed", + retryable: true, + }); +} + +export function getExternalTtsConfig({ + env, +}: { + env: Record<string, string | undefined>; +}): ExternalTtsConfig { + const parsed = externalTtsConfigSchema.safeParse( + resolveExternalTtsEnv({ env }), + ); + + if (!parsed.success) { + throw new TtsError({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); + } + + const apiBaseUrl = parsed.data.API_BASE_URL.trim().replace(/\/+$/, ""); + const apiKey = parsed.data.API_KEY.trim(); + const model = parsed.data.API_MODEL.trim(); + + if (!apiBaseUrl || !apiKey || !model) { + throw new TtsError({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); + } + + try { + const url = new URL(apiBaseUrl); + + if (url.protocol !== "http:" && url.protocol !== "https:") { + throw new Error("Unsupported protocol"); + } + } catch { + throw new TtsError({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); + } + + return { + apiBaseUrl, + apiKey, + model, + }; +} + +function resolveVoice({ voice }: { voice?: string }): string { + if (!voice || voice === DEFAULT_VOICE_PACK) { + return DEFAULT_EXTERNAL_TTS_VOICE; + } + + return voice; +} + +async function getUpstreamErrorMessage({ + response, +}: { + response: Response; +}): Promise<string> { + const contentType = response.headers.get("content-type") ?? 
""; + const text = await response.text().catch(() => ""); + + if (contentType.includes("application/json")) { + const json = (() => { + try { + return JSON.parse(text) as { + error?: + | string + | { + message?: string; + }; + } | null; + } catch { + return null; + } + })(); + + if (typeof json?.error === "string" && json.error.trim()) { + return json.error; + } + + if ( + typeof json?.error === "object" && + typeof json.error?.message === "string" && + json.error.message.trim() + ) { + return json.error.message; + } + } + + if (text.trim()) { + return text; + } + + return String(response.status); +} + +function getSpeechEndpointUrls({ + apiBaseUrl, +}: { + apiBaseUrl: string; +}): string[] { + const normalizedBaseUrl = apiBaseUrl.replace(/\/+$/, ""); + const baseWithoutV1 = normalizedBaseUrl.endsWith("/v1") + ? normalizedBaseUrl.slice(0, -3) + : normalizedBaseUrl; + const baseWithV1 = normalizedBaseUrl.endsWith("/v1") + ? normalizedBaseUrl + : `${normalizedBaseUrl}/v1`; + const urls = [`${baseWithV1}/audio/speech`, `${baseWithoutV1}/audio/speech`]; + + return [...new Set(urls)]; +} + +function getResponsesEndpointUrls({ + apiBaseUrl, +}: { + apiBaseUrl: string; +}): string[] { + const normalizedBaseUrl = apiBaseUrl.replace(/\/+$/, ""); + const baseWithoutV1 = normalizedBaseUrl.endsWith("/v1") + ? normalizedBaseUrl.slice(0, -3) + : normalizedBaseUrl; + const baseWithV1 = normalizedBaseUrl.endsWith("/v1") + ? normalizedBaseUrl + : `${normalizedBaseUrl}/v1`; + const urls = [`${baseWithV1}/responses`, `${baseWithoutV1}/responses`]; + + return [...new Set(urls)]; +} + +function toWebSocketUrl({ url }: { url: string }): string { + const parsed = new URL(url); + parsed.protocol = parsed.protocol === "https:" ? "wss:" : "ws:"; + return parsed.toString(); +} + +function isAudioContentType({ contentType }: { contentType: string }): boolean { + const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? 
""; + + return ( + mimeType.startsWith("audio/") || mimeType === "application/octet-stream" + ); +} + +function shouldTryResponsesWebSocket({ + response, +}: { + response: Response; +}): boolean { + if ( + response.status === 404 || + response.status === 405 || + response.status === 426 + ) { + return true; + } + + if (!response.ok) { + return false; + } + + const contentType = response.headers.get("content-type") ?? ""; + const mimeType = contentType.split(";")[0]?.trim().toLowerCase() ?? ""; + + return mimeType === "text/html"; +} + +async function cancelResponseBody({ + response, +}: { + response: Response; +}): Promise<void> { + try { + await response.body?.cancel(); + } catch { + // Best-effort cleanup only. + } +} + +function getResponsesWebSocketCloseRetryable({ + code, + reason, +}: { + code: number; + reason: string; +}): boolean { + const normalizedReason = reason.trim().toLowerCase(); + + if ( + normalizedReason.includes("required") || + normalizedReason.includes("unsupported") + ) { + return false; + } + + return code === 1006 || code === 1011 || code === 1012 || code === 1013; +} + +function getResponsesWebSocketError({ + code, + reason, +}: { + code: number; + reason: string; +}): TtsError { + return new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `External TTS websocket request failed: ${ + reason || `WebSocket closed (${code})` + }`, + retryable: getResponsesWebSocketCloseRetryable({ code, reason }), + }); +} + +function getResponseEventErrorMessage({ + event, +}: { + event: Record<string, unknown>; +}): string | null { + if (typeof event.message === "string" && event.message.trim()) { + return event.message; + } + + if ( + typeof event.error === "object" && + event.error !== null && + "message" in event.error && + typeof event.error.message === "string" && + event.error.message.trim() + ) { + return event.error.message; + } + + return null; +} + +function createExternalTtsWebSocket( + url: string, + init?: { headers?: Record<string, string> }, +): ExternalTtsWebSocketLike { + 
type NodeCompatibleWebSocket = new ( + url: string, + init?: { headers?: Record<string, string> }, + ) => ExternalTtsWebSocketLike; + + const WebSocketCtor = + globalThis.WebSocket as unknown as NodeCompatibleWebSocket; + + return new WebSocketCtor(url, init); +} + +async function synthesizeSpeechWithResponsesWebSocket({ + config, + createWebSocket = createExternalTtsWebSocket, + text, + voice, +}: { + config: ExternalTtsConfig; + createWebSocket?: ExternalTtsWebSocketFactory; + text: string; + voice?: string; +}): Promise<ArrayBuffer> { + const endpointUrl = toWebSocketUrl({ + url: + getResponsesEndpointUrls({ apiBaseUrl: config.apiBaseUrl })[0] ?? + `${config.apiBaseUrl.replace(/\/+$/, "")}/responses`, + }); + const audioChunks: Uint8Array[] = []; + + return await new Promise((resolve, reject) => { + const socket = createWebSocket(endpointUrl, { + headers: { + Authorization: `Bearer ${config.apiKey}`, + }, + }); + let settled = false; + + const cleanup = () => { + socket.removeEventListener?.("close", handleClose); + socket.removeEventListener?.("error", handleError); + socket.removeEventListener?.("message", handleMessage); + socket.removeEventListener?.("open", handleOpen); + }; + + const finish = ({ + error, + value, + }: { + error?: TtsError; + value?: ArrayBuffer; + }) => { + if (settled) { + return; + } + + settled = true; + cleanup(); + + try { + socket.close(); + } catch { + // Best effort cleanup only. + } + + if (error) { + reject(error); + return; + } + + resolve(value ?? 
new ArrayBuffer(0)); + }; + + const handleOpen = () => { + try { + socket.send( + JSON.stringify({ + audio: { + format: EXTERNAL_TTS_RESPONSES_AUDIO_FORMAT, + }, + input: text, + model: config.model, + output_modalities: ["audio"], + response: { + instructions: text, + modalities: ["audio"], + output_audio_format: EXTERNAL_TTS_RESPONSES_AUDIO_FORMAT, + voice: resolveVoice({ voice }), + }, + type: "response.create", + }), + ); + } catch (error) { + finish({ + error: wrapExternalUpstreamError({ error }), + }); + } + }; + + const handleMessage = async ({ + data, + }: ExternalTtsWebSocketMessageEvent) => { + try { + if (data instanceof Blob) { + audioChunks.push(new Uint8Array(await data.arrayBuffer())); + return; + } + + if (data instanceof ArrayBuffer) { + audioChunks.push(new Uint8Array(data)); + return; + } + + if (ArrayBuffer.isView(data)) { + audioChunks.push( + new Uint8Array( + data.buffer.slice( + data.byteOffset, + data.byteOffset + data.byteLength, + ), + ), + ); + return; + } + + if (typeof data !== "string") { + return; + } + + const event = JSON.parse(data) as Record<string, unknown>; + const type = typeof event.type === "string" ? 
event.type : ""; + + if ( + type === "response.audio.delta" || + type === "response.output_audio.delta" + ) { + if (typeof event.delta === "string" && event.delta.length > 0) { + audioChunks.push( + Uint8Array.from(Buffer.from(event.delta, "base64")), + ); + } + return; + } + + if (type === "response.completed" || type === "response.done") { + const audio = Buffer.concat( + audioChunks.map((chunk) => Buffer.from(chunk)), + ); + + if (audio.byteLength === 0) { + finish({ + error: new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS returned empty audio", + retryable: false, + }), + }); + return; + } + + finish({ + value: audio.buffer.slice( + audio.byteOffset, + audio.byteOffset + audio.byteLength, + ), + }); + return; + } + + if ( + type === "error" || + type === "response.error" || + type === "response.failed" || + type === "response.incomplete" + ) { + finish({ + error: new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + getResponseEventErrorMessage({ event }) ?? 
+ "External TTS websocket request failed", + retryable: false, + }), + }); + } + } catch (error) { + finish({ + error: wrapExternalUpstreamError({ error }), + }); + } + }; + + const handleError = (event: ExternalTtsWebSocketErrorEvent) => { + finish({ + error: new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + event.message?.trim() || "External TTS websocket request failed", + retryable: true, + }), + }); + }; + + const handleClose = ({ code, reason }: ExternalTtsWebSocketCloseEvent) => { + if (settled) { + return; + } + + finish({ + error: getResponsesWebSocketError({ code, reason }), + }); + }; + + socket.addEventListener("open", handleOpen); + socket.addEventListener("message", handleMessage); + socket.addEventListener("error", handleError); + socket.addEventListener("close", handleClose); + }); +} + +export async function synthesizeSpeechWithOpenAiCompatible({ + config, + createWebSocket = createExternalTtsWebSocket, + text, + voice, + fetchImpl = fetch, + timeoutMs = EXTERNAL_TTS_TIMEOUT_MS, +}: { + config: ExternalTtsConfig; + createWebSocket?: ExternalTtsWebSocketFactory; + text: string; + voice?: string; + fetchImpl?: FetchLike; + timeoutMs?: number; +}): Promise<ArrayBuffer> { + const endpointUrls = getSpeechEndpointUrls({ + apiBaseUrl: config.apiBaseUrl, + }); + const requestInit = { + method: "POST", + headers: { + Authorization: `Bearer ${config.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + input: text, + model: config.model, + response_format: "mp3", + voice: resolveVoice({ voice }), + }), + } satisfies RequestInit; + + let lastErrorResponse: Response | null = null; + + for (const endpointUrl of endpointUrls) { + let response: Response; + + try { + response = await fetchWithTimeout({ + fetchImpl, + init: requestInit, + input: endpointUrl, + timeoutMessage: "External TTS request timed out", + timeoutMs, + }); + } catch (error) { + throw wrapExternalUpstreamError({ error }); + } + + if (response.ok) { + const contentType = 
response.headers.get("content-type") ?? ""; + if (!isAudioContentType({ contentType })) { + if (shouldTryResponsesWebSocket({ response })) { + lastErrorResponse = response; + await cancelResponseBody({ response }); + break; + } + + await cancelResponseBody({ response }); + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `Expected audio response, received ${contentType || "(no content-type)"}`, + retryable: false, + status: response.status, + }); + } + + let audio: ArrayBuffer; + + try { + audio = await response.arrayBuffer(); + } catch (error) { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `External TTS audio read failed: ${error instanceof Error ? error.message : "Unknown error"}`, + retryable: false, + status: response.status, + }); + } + + if (audio.byteLength === 0) { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS returned empty audio", + retryable: false, + status: response.status, + }); + } + + return audio; + } + + lastErrorResponse = response; + + if (response.status !== 404) { + break; + } + + await cancelResponseBody({ response }); + } + + if ( + lastErrorResponse && + shouldTryResponsesWebSocket({ response: lastErrorResponse }) + ) { + return synthesizeSpeechWithResponsesWebSocket({ + config, + createWebSocket, + text, + voice, + }); + } + + if (!lastErrorResponse) { + throw new Error( + "Expected external TTS to capture an upstream response before failing", + ); + } + + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: `External TTS request failed: ${await getUpstreamErrorMessage({ + response: lastErrorResponse, + })}`, + retryable: isRetryableStatus(lastErrorResponse.status), + status: lastErrorResponse.status, + }); +} diff --git a/apps/web/src/lib/tts/provider.test.ts b/apps/web/src/lib/tts/provider.test.ts new file mode 100644 index 0000000..e214fc5 --- /dev/null +++ b/apps/web/src/lib/tts/provider.test.ts @@ -0,0 +1,215 @@ +import { describe, expect, test } from 
"bun:test"; +import { TtsError } from "./errors"; +import { synthesizeSpeechWithFallback } from "./provider"; + +describe("synthesizeSpeechWithFallback", () => { + test("returns the configured external provider result when it succeeds", async () => { + let legacyCalled = false; + + const result = await synthesizeSpeechWithFallback({ + env: { + EXTERNAL_TTS_API_BASE_URL: "https://example.com/v1", + EXTERNAL_TTS_API_MODEL: "tts-1", + EXTERNAL_TTS_API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => Uint8Array.from([1, 2, 3]).buffer, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([9, 9, 9]).buffer; + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([1, 2, 3]); + expect(legacyCalled).toBe(false); + }); + + test("falls back to the legacy provider for structured external upstream errors", async () => { + let legacyCalled = false; + + const result = await synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + "External TTS request failed: Expected audio response, received text/html; charset=utf-8", + }); + }, + legacySynthesize: async ({ text }) => { + legacyCalled = true; + expect(text).toBe("hello"); + return Uint8Array.from([7, 8, 9]).buffer; + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([7, 8, 9]); + expect(legacyCalled).toBe(true); + }); + + test("rethrows unexpected external provider errors instead of silently falling back", async () => { + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new Error("unexpected provider failure"); + }, + 
legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }), + ).rejects.toThrow("unexpected provider failure"); + + expect(legacyCalled).toBe(false); + }); + + test("rethrows non-retryable external upstream errors instead of falling back", async () => { + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS request failed: invalid api key", + retryable: false, + status: 401, + }); + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + retryable: false, + status: 401, + }); + + expect(legacyCalled).toBe(false); + }); + + test("falls back when the external provider reports no available account", async () => { + let legacyCalled = false; + + const result = await synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + "External TTS websocket request failed: no available account", + retryable: true, + }); + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }); + + expect(Array.from(new Uint8Array(result))).toEqual([7, 8, 9]); + expect(legacyCalled).toBe(true); + }); + + test("does not fall back when the external provider returns a non-retryable websocket account error", async () => { + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: { + API_BASE_URL: "https://example.com/v1", + API_MODEL: "tts-1", + API_KEY: "secret", + }, 
+ text: "hello", + voice: "default", + openAiSynthesize: async () => { + throw new TtsError({ + code: "EXTERNAL_TTS_UPSTREAM", + message: + "External TTS websocket request failed: no available account", + retryable: false, + }); + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([7, 8, 9]).buffer; + }, + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_UPSTREAM", + message: "External TTS websocket request failed: no available account", + retryable: false, + }); + + expect(legacyCalled).toBe(false); + }); + + test("rethrows missing external config instead of silently falling back", async () => { + let openAiCalled = false; + let legacyCalled = false; + + await expect( + synthesizeSpeechWithFallback({ + env: {}, + text: "hello", + voice: "default", + openAiSynthesize: async () => { + openAiCalled = true; + return Uint8Array.from([1]).buffer; + }, + legacySynthesize: async () => { + legacyCalled = true; + return Uint8Array.from([9]).buffer; + }, + }), + ).rejects.toThrow("External TTS is not configured"); + + expect(openAiCalled).toBe(false); + expect(legacyCalled).toBe(false); + }); + + test("rethrows missing external config with a structured error code", async () => { + await expect( + synthesizeSpeechWithFallback({ + env: {}, + text: "hello", + voice: "default", + }), + ).rejects.toMatchObject({ + code: "EXTERNAL_TTS_CONFIG", + message: "External TTS is not configured", + }); + }); +}); diff --git a/apps/web/src/lib/tts/provider.ts b/apps/web/src/lib/tts/provider.ts new file mode 100644 index 0000000..39d1c37 --- /dev/null +++ b/apps/web/src/lib/tts/provider.ts @@ -0,0 +1,48 @@ +import { + getExternalTtsConfig, + synthesizeSpeechWithOpenAiCompatible, +} from "./openai-compatible"; +import { isTtsError } from "./errors"; +import { synthesizeSpeechWithLegacyProvider } from "./legacy"; + +type TtsEnv = { + API_BASE_URL?: string; + API_MODEL?: string; + API_KEY?: string; + EXTERNAL_TTS_API_BASE_URL?: string; + 
EXTERNAL_TTS_API_MODEL?: string; + EXTERNAL_TTS_API_KEY?: string; +}; + +export async function synthesizeSpeechWithFallback({ + env, + text, + voice, + openAiSynthesize = synthesizeSpeechWithOpenAiCompatible, + legacySynthesize = synthesizeSpeechWithLegacyProvider, +}: { + env: TtsEnv; + text: string; + voice?: string; + openAiSynthesize?: typeof synthesizeSpeechWithOpenAiCompatible; + legacySynthesize?: typeof synthesizeSpeechWithLegacyProvider; +}): Promise<ArrayBuffer> { + try { + const config = getExternalTtsConfig({ env }); + return await openAiSynthesize({ config, text, voice }); + } catch (error) { + if (isTtsError(error) && error.code === "EXTERNAL_TTS_CONFIG") { + throw error; + } + + if ( + !isTtsError(error) || + error.code !== "EXTERNAL_TTS_UPSTREAM" || + error.retryable === false + ) { + throw error; + } + + return legacySynthesize({ text, voice }); + } +} diff --git a/apps/web/src/lib/tts/service.test.ts b/apps/web/src/lib/tts/service.test.ts new file mode 100644 index 0000000..86624c2 --- /dev/null +++ b/apps/web/src/lib/tts/service.test.ts @@ -0,0 +1,220 @@ +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import type { EditorCore } from "@/core"; +import type { AudioTrack } from "@/types/timeline"; +import { generateAndInsertSpeech, generateSpeechFromText } from "./service"; + +const originalFetch = globalThis.fetch; +const originalAudioContext = globalThis.AudioContext; +const originalCreateObjectURL = URL.createObjectURL; + +describe("tts service", () => { + let decodedBytes: number[] | null; + let fakeBuffer: AudioBuffer; + + beforeEach(() => { + decodedBytes = null; + fakeBuffer = { duration: 2.5 } as AudioBuffer; + + Object.defineProperty(globalThis, "AudioContext", { + configurable: true, + value: class FakeAudioContext { + async decodeAudioData(arrayBuffer: ArrayBuffer) { + decodedBytes = Array.from(new Uint8Array(arrayBuffer)); + return fakeBuffer; + } + }, + }); + URL.createObjectURL = mock(() => "blob:tts-preview"); + 
}); + + afterEach(() => { + globalThis.fetch = originalFetch; + Object.defineProperty(globalThis, "AudioContext", { + configurable: true, + value: originalAudioContext, + }); + URL.createObjectURL = originalCreateObjectURL; + }); + + test("generateSpeechFromText decodes base64 audio returned by the route", async () => { + const fetchCalls: Array<[RequestInfo | URL, RequestInit | undefined]> = []; + globalThis.fetch = (async (input, init) => { + fetchCalls.push([input, init]); + return Response.json({ audio: "AQID" }); + }) as typeof fetch; + + const result = await generateSpeechFromText({ + text: "hello", + voice: "nova", + }); + + expect(fetchCalls).toHaveLength(1); + expect(fetchCalls[0]?.[0]).toBe("/api/tts/generate"); + expect(fetchCalls[0]?.[1]).toMatchObject({ + method: "POST", + headers: { "Content-Type": "application/json" }, + }); + expect(JSON.parse(String(fetchCalls[0]?.[1]?.body))).toEqual({ + text: "hello", + voice: "nova", + }); + expect(decodedBytes).toEqual([1, 2, 3]); + expect(result.duration).toBe(2.5); + expect(result.buffer).toBe(fakeBuffer); + expect(result.blob.type).toBe("audio/mpeg"); + expect(Array.from(new Uint8Array(await result.blob.arrayBuffer()))).toEqual( + [1, 2, 3], + ); + }); + + test("generateAndInsertSpeech uploads generated audio and inserts it into an existing audio track", async () => { + globalThis.fetch = (async () => + Response.json({ audio: "AQID" })) as unknown as typeof fetch; + + const tracks: AudioTrack[] = [ + { + id: "audio-track-1", + name: "Audio 1", + type: "audio", + muted: false, + elements: [], + }, + ]; + const addMediaAssetCalls: unknown[] = []; + const addMediaAssetMock = async (args: unknown) => { + addMediaAssetCalls.push(args); + return "media-1"; + }; + let addTrackCallCount = 0; + const addTrackMock = () => { + addTrackCallCount++; + throw new Error("addTrack should not be called"); + }; + const insertElementCalls: unknown[] = []; + const insertElementMock = (args: unknown) => { + 
insertElementCalls.push(args); + }; + + const editor = { + media: { + addMediaAsset: addMediaAssetMock, + }, + project: { + getActive: () => ({ + metadata: { id: "project-1" }, + }), + }, + timeline: { + getTracks: () => tracks, + addTrack: addTrackMock, + insertElement: insertElementMock, + }, + } as unknown as EditorCore; + + const result = await generateAndInsertSpeech({ + editor, + text: "hello world", + startTime: 3, + voice: "default", + }); + + expect(result).toEqual({ duration: 2.5 }); + expect(addMediaAssetCalls).toHaveLength(1); + expect(addMediaAssetCalls[0]).toMatchObject({ + projectId: "project-1", + asset: { + name: "TTS: hello world", + type: "audio", + url: "blob:tts-preview", + duration: 2.5, + ephemeral: true, + }, + }); + expect(insertElementCalls).toHaveLength(1); + expect(insertElementCalls[0]).toMatchObject({ + placement: { + mode: "explicit", + trackId: "audio-track-1", + }, + element: { + type: "audio", + sourceType: "upload", + mediaId: "media-1", + name: "TTS: hello world", + duration: 2.5, + startTime: 3, + buffer: fakeBuffer, + }, + }); + expect(addTrackCallCount).toBe(0); + }); + + test("generateAndInsertSpeech creates a new audio track when existing ones overlap", async () => { + globalThis.fetch = (async () => + Response.json({ audio: "AQID" })) as unknown as typeof fetch; + + const tracks: AudioTrack[] = [ + { + id: "audio-track-1", + name: "Audio 1", + type: "audio", + muted: false, + elements: [ + { + id: "audio-el-1", + type: "audio", + sourceType: "upload", + mediaId: "existing-media", + name: "Existing audio", + duration: 10, + startTime: 0, + trimStart: 0, + trimEnd: 0, + volume: 1, + muted: false, + }, + ], + }, + ]; + const addMediaAssetMock = async () => "media-2"; + const addTrackCalls: unknown[] = []; + const addTrackMock = (args: unknown) => { + addTrackCalls.push(args); + return "audio-track-2"; + }; + const insertElementCalls: unknown[] = []; + const insertElementMock = (args: unknown) => { + 
insertElementCalls.push(args); + }; + + const editor = { + media: { + addMediaAsset: addMediaAssetMock, + }, + project: { + getActive: () => ({ + metadata: { id: "project-1" }, + }), + }, + timeline: { + getTracks: () => tracks, + addTrack: addTrackMock, + insertElement: insertElementMock, + }, + } as unknown as EditorCore; + + await generateAndInsertSpeech({ + editor, + text: "overlap check", + startTime: 2, + }); + + expect(addTrackCalls).toEqual([{ type: "audio" }]); + expect(insertElementCalls[0]).toMatchObject({ + placement: { + mode: "explicit", + trackId: "audio-track-2", + }, + }); + }); +}); diff --git a/docs/plans/2026-03-17-tts-external-provider-design.md b/docs/plans/2026-03-17-tts-external-provider-design.md new file mode 100644 index 0000000..c2bf239 --- /dev/null +++ b/docs/plans/2026-03-17-tts-external-provider-design.md @@ -0,0 +1,133 @@ +# 外部 TTS 扩展设计 + +## 背景 + +`TIA-51` 要求 Cutia 支持调用外部 TTS API,把文本或对话内容生成语音并接入视频编辑流程。 + +改造前,仓库已经有一条从文本元素生成语音并插入时间线的链路,但服务端路由 `apps/web/src/app/api/tts/generate/route.ts` 在上游扩展能力和 `voice` 语义上都存在限制。这意味着: + +- 外部 TTS 提供方无法通过环境配置切换 +- `voice` 参数虽然已经沿链路透传,但缺少与外部 provider 对齐的清晰适配语义 +- 错误语义受限于硬编码上游,缺少可维护的适配层 + +后续设计会在保留既有编辑器接入方式的前提下,把 provider 配置、fallback 与错误适配层补齐为可维护结构。 + +## 目标 + +- 通过环境变量配置外部 TTS 提供方 +- 支持以文本内容调用外部 TTS API 并返回可插入编辑器的音频 +- 让现有 `voice` 参数真正参与外部请求 +- 为失败场景提供明确、可测试的错误返回 + +## 非目标 + +- 不在本次中引入完整的多供应商设置 UI +- 不改动现有时间线插入和媒体入库的主流程 +- 不为每个第三方 TTS 服务单独做适配器注册中心 + +## 方案比较 + +### 方案 A: 保留当前硬编码上游,只补更多参数 + +优点: +- 改动最少 + +缺点: +- 仍然不满足“外部 TTS 能力扩展”的核心要求 +- 供应商不可配置 +- 无法安全复用运行环境已提供的 API 配置 + +结论:不采用。 + +### 方案 B: 改为环境驱动的 OpenAI 兼容 TTS 适配层 + +优点: +- 只需要一层薄适配,即可支持大量 OpenAI 兼容的 TTS 服务 +- 优先使用 `EXTERNAL_TTS_API_BASE_URL`、`EXTERNAL_TTS_API_MODEL`、`EXTERNAL_TTS_API_KEY` +- 兼容读取当前运行环境里的 `API_BASE_URL`、`API_MODEL`、`API_KEY` 作为迁移别名 +- 前端接口保持不变,编辑器链路改动最小 + +缺点: +- 需要定义默认 voice 映射 +- 需要自己处理二进制音频响应与错误解析 + +结论:采用。 + +### 方案 C: 直接搭建供应商注册中心 + +优点: +- 长期扩展性最好 + +缺点: +- 对当前工单明显过度设计 +- 需要更多配置、UI 和测试面 + +结论:当前不采用。 + +## 决策 + +采用方案 
B:新增一个面向 OpenAI 兼容接口的 TTS 适配层,服务端路由只负责参数校验、调用适配器并把音频转为前端可消费的 base64。 + +## 架构 + +### 服务端 + +- 在 `apps/web/src/lib/tts/` 下新增可测试的适配模块 +- 模块职责: + - 读取并规范化 TTS 配置 + - 把 `text`/`voice` 转换为上游 `/audio/speech` 请求 + - 解析上游失败响应,输出明确错误 + - 返回音频 `ArrayBuffer` +- 路由只保留: + - 请求体验证 + - 调用适配模块 + - 转 base64 返回 `{ audio }` + +### 前端 + +- `apps/web/src/lib/tts/service.ts` 保持调用 `/api/tts/generate` 的协议不变 +- `apps/web/src/constants/tts-constants.ts` 提供可实际使用的 voice 列表与默认值 +- 文本面板和动作系统继续复用既有插入媒体/时间线逻辑 + +## 数据流 + +1. 用户在文本属性面板或动作系统触发 TTS +2. 前端向 `/api/tts/generate` 提交 `{ text, voice }` +3. 服务端校验参数 +4. 服务端使用环境变量构造对外部 TTS API 的请求 +5. 外部 TTS 返回音频二进制 +6. 服务端转为 base64 JSON 响应 +7. 前端解码为 `Blob` 与 `AudioBuffer` +8. 编辑器把音频加入媒体库并插入音轨 + +## 错误处理 + +- 请求参数非法:返回 `400` +- TTS 环境变量缺失:返回 `500`,信息明确为未配置 +- 外部 provider 返回可重试的 `EXTERNAL_TTS_UPSTREAM`(例如超时、`429`、`5xx`,或 websocket `no available account` 这类账号容量耗尽)时,会先回退到 legacy provider + - legacy 回退成功:最终仍可能返回 `200` + - legacy 回退失败:最终返回 `502` +- 外部 provider 返回不可重试的 `EXTERNAL_TTS_UPSTREAM`(例如 `401`/`403`/`404`、空音频、非音频响应)时:直接返回 `502` +- 未知异常:返回 `500` + +## 测试策略 + +### 自动化测试 + +- 为适配层写纯函数测试,覆盖: + - `default` voice 映射 + - 请求 URL、headers、body 是否正确 + - 外部错误 JSON / 文本响应映射 + - 成功时返回音频数据 + +### 真实验证 + +- 使用环境中的真实 TTS 配置验证,优先为 `EXTERNAL_TTS_API_*`,没有时回退到 `API_*` +- 直接运行一次服务端适配逻辑,验证能拿到非空 MP3 数据 + +## 风险与缓解 + +- 外部服务不完全兼容 OpenAI TTS 协议 + - 缓解:把适配逻辑集中在单模块,后续改协议只动一处 +- 默认 voice 与实际模型不匹配 + - 缓解:统一在适配层做默认映射,避免前端散落判断 diff --git a/docs/plans/2026-03-17-tts-external-provider.md b/docs/plans/2026-03-17-tts-external-provider.md new file mode 100644 index 0000000..8b32e0f --- /dev/null +++ b/docs/plans/2026-03-17-tts-external-provider.md @@ -0,0 +1,146 @@ +# 外部 TTS 扩展 Implementation Plan + +**Goal:** 让 Cutia 的 TTS 能力从硬编码单一路由改为可配置的外部 TTS API 调用,并继续把生成语音接入媒体库和时间线。 + +**Architecture:** 在 `apps/web/src/lib/tts/` 新增可测试的 OpenAI 兼容 TTS 适配层,`/api/tts/generate` 只负责校验和响应转换,前端调用协议保持 `{ audio }` 不变。通过 `packages/env` 暴露配置,避免把供应商细节散落到 UI 和编辑器逻辑里。 + +**Tech Stack:** Next.js route handlers, 
TypeScript, Zod, Bun test, OpenAI-compatible HTTP API + +--- + +## Task 1: 补环境与 voice 常量基线 + +**Files:** +- Modify: `packages/env/src/web.ts` +- Modify: `apps/web/src/constants/tts-constants.ts` + +**Step 1: 写出目标测试用例草案** + +- 目标行为: + - TTS 配置可从环境读取 + - `default` voice 会映射到可用的默认外部 voice + +**Step 2: 运行当前目标测试确认缺失** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: FAIL,原因是测试文件或实现不存在。 + +**Step 3: 为后续实现准备最小配置面** + +- 在环境 schema 中加入 `EXTERNAL_TTS_API_BASE_URL`、`EXTERNAL_TTS_API_MODEL`、`EXTERNAL_TTS_API_KEY` +- 兼容读取旧的 `API_BASE_URL`、`API_MODEL`、`API_KEY`,用于迁移与当前共享环境 +- 在 TTS 常量中定义默认 voice 与可选 voice 列表 + +**Step 4: 运行定向测试** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: 仍然失败,但失败点缩小到适配实现缺失。 + +**Step 5: Commit** + +```bash +git add packages/env/src/web.ts apps/web/src/constants/tts-constants.ts +git commit -m "feat: prepare external tts config" +``` + +## Task 2: 先写失败测试覆盖外部 TTS 适配层 + +**Files:** +- Create: `apps/web/src/lib/tts/openai-compatible.test.ts` +- Create: `apps/web/src/lib/tts/openai-compatible.ts` + +**Step 1: 写失败测试** + +- 成功场景:正确构造 `/audio/speech` 请求并返回音频 +- 失败场景:上游 JSON 错误、文本错误、空配置错误 +- voice 场景:`default` 被映射为默认 voice + +**Step 2: 运行测试验证失败** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: FAIL,且失败原因为导入缺失或行为不匹配,不是测试写错。 + +**Step 3: 写最小实现** + +- 提供配置解析 +- 提供请求构造 +- 提供错误解析 +- 提供音频数组缓冲区返回 + +**Step 4: 运行测试确认通过** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/web/src/lib/tts/openai-compatible.ts apps/web/src/lib/tts/openai-compatible.test.ts +git commit -m "feat: add external tts adapter" +``` + +## Task 3: 接回 API 路由 + +**Files:** +- Modify: `apps/web/src/app/api/tts/generate/route.ts` + +**Step 1: 写失败测试预期** + +- 通过 Task 2 已确保适配层正确 +- 当前路由仍硬编码旧上游,因此与新适配层设计不一致 + +**Step 2: 运行现有测试基线** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 3: 最小改造路由** + +- 删除硬编码上游 
URL 和旧返回结构解析 +- 保留 Zod 请求校验 +- 调用适配层并统一转换为 `{ audio }` + +**Step 4: 运行相关测试** + +Run: `bun test apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/web/src/app/api/tts/generate/route.ts +git commit -m "feat: wire route to external tts provider" +``` + +## Task 4: 端到端验证与整理 + +**Files:** +- Modify: `docs/plans/2026-03-17-tts-external-provider-design.md` +- Modify: `docs/plans/2026-03-17-tts-external-provider.md` + +**Step 1: 运行自动化测试** + +Run: `pnpm --filter @cutia/web test -- apps/web/src/lib/tts/openai-compatible.test.ts` +Expected: PASS + +**Step 2: 运行真实外部 TTS 验证** + +Run: `bun --eval 'import { getExternalTtsConfig, synthesizeSpeechWithOpenAiCompatible } from "./apps/web/src/lib/tts/openai-compatible.ts"; const config = getExternalTtsConfig({ env: process.env }); const audio = await synthesizeSpeechWithOpenAiCompatible({ config, text: "Cutia TTS probe", voice: "default" }); console.log(audio.byteLength);'` +Expected: 输出非空音频字节长度,不打印密钥。 + +**Step 3: 检查格式与类型** + +Run: `pnpm --filter @cutia/web lint` +Expected: PASS + +**Step 4: 整理工作台与提交内容** + +- 更新 Linear 工作台中的验收、验证和备注 +- 推送分支并创建 PR + +**Step 5: Commit** + +```bash +git add docs/plans/2026-03-17-tts-external-provider-design.md docs/plans/2026-03-17-tts-external-provider.md +git commit -m "docs: capture external tts plan" +``` diff --git a/packages/env/src/web.ts b/packages/env/src/web.ts index 4ec86fe..c40efc5 100644 --- a/packages/env/src/web.ts +++ b/packages/env/src/web.ts @@ -21,6 +21,12 @@ const webEnvSchema = z.object({ UPSTASH_REDIS_REST_TOKEN: z.string(), FREESOUND_CLIENT_ID: z.string().optional(), FREESOUND_API_KEY: z.string().optional(), + EXTERNAL_TTS_API_BASE_URL: z.string().optional(), + EXTERNAL_TTS_API_MODEL: z.string().optional(), + EXTERNAL_TTS_API_KEY: z.string().optional(), + API_BASE_URL: z.string().optional(), + API_MODEL: z.string().optional(), + API_KEY: z.string().optional(), // Cloudflare R2 R2_ACCOUNT_ID: z.string().optional(),