diff --git a/.changeset/thin-worlds-tie.md b/.changeset/thin-worlds-tie.md new file mode 100644 index 00000000000..5506452e807 --- /dev/null +++ b/.changeset/thin-worlds-tie.md @@ -0,0 +1,6 @@ +--- +'@mastra/speech-openai': minor +'@mastra/voice-openai': minor +--- + +Deprecate @mastra/speech-openai for @mastra/voice-openai diff --git a/.gitignore b/.gitignore index 597ee2532c9..a0d5f6dec1a 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,7 @@ openapi-ts-error* .secrets # Local Netlify folder .netlify -.npmrc \ No newline at end of file +.npmrc + +# Test output directories +voice/**/test-output*/ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9b9e4822dc1..44fff26549b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -2076,7 +2076,7 @@ importers: version: 7.50.0(@types/node@22.13.4) '@rollup/plugin-image': specifier: ^3.0.3 - version: 3.0.3(rollup@4.34.8) + version: 3.0.3(rollup@3.29.5) '@size-limit/preset-small-lib': specifier: ^11.1.4 version: 11.2.0(size-limit@11.2.0) @@ -2137,7 +2137,7 @@ importers: version: 7.50.0(@types/node@22.13.4) '@rollup/plugin-image': specifier: ^3.0.3 - version: 3.0.3(rollup@3.29.5) + version: 3.0.3(rollup@4.34.8) '@size-limit/preset-small-lib': specifier: ^11.1.4 version: 11.2.0(size-limit@11.2.0) @@ -3452,9 +3452,6 @@ importers: typescript: specifier: ^5.7.3 version: 5.7.3 - vitest: - specifier: ^2.1.8 - version: 2.1.9(@edge-runtime/vm@3.2.0)(@types/node@22.13.4)(jsdom@20.0.3(bufferutil@4.0.9)(canvas@2.11.2(encoding@0.1.13))(utf-8-validate@6.0.5))(terser@5.39.0) speech/playai: dependencies: @@ -3943,6 +3940,34 @@ importers: specifier: ^5.7.3 version: 5.7.3 + voice/openai: + dependencies: + '@mastra/core': + specifier: workspace:^ + version: link:../../packages/core + openai: + specifier: ^4.28.0 + version: 4.85.2(encoding@0.1.13)(ws@8.18.0(bufferutil@4.0.9)(utf-8-validate@6.0.5))(zod@3.24.2) + zod: + specifier: ^3.24.1 + version: 3.24.2 + devDependencies: + '@microsoft/api-extractor': + specifier: ^7.49.2 + version: 7.50.0(@types/node@22.13.4) + '@types/node': + specifier: ^22.13.1 + version: 22.13.4 + tsup: + specifier: ^8.3.6 + version: 8.3.6(@microsoft/api-extractor@7.50.0(@types/node@22.13.4))(@swc/core@1.10.18(@swc/helpers@0.5.15))(jiti@2.4.2)(postcss@8.5.2)(tsx@4.19.3)(typescript@5.7.3)(yaml@2.7.0) + typescript: + specifier: ^5.7.3 + version: 5.7.3 + vitest: + specifier: ^2.1.8 + version: 2.1.9(@edge-runtime/vm@3.2.0)(@types/node@22.13.4)(jsdom@20.0.3(bufferutil@4.0.9)(canvas@2.11.2(encoding@0.1.13))(utf-8-validate@6.0.5))(terser@5.39.0) + packages: '@ai-sdk/anthropic@1.1.9': diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 730506a27e3..9b274d33d0b 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -5,6 +5,7 @@ packages: - "vector-stores/*" - "stores/*" - "speech/*" + - "voice/*" - "client-sdks/*" - "!packages/cli/admin" - "integration-generator/*" diff --git a/speech/openai/README.md b/speech/openai/README.md index 0891b8068f5..d6e90171ead 100644 --- a/speech/openai/README.md +++ b/speech/openai/README.md @@ -1,67 +1,25 @@ -# @mastra/speech-openai +# @mastra/speech-openai (DEPRECATED) -OpenAI Speech integration for Mastra, providing Text-to-Speech (TTS) capabilities using OpenAI's advanced speech models. +⚠️ **This package is deprecated.** Please use [@mastra/voice-openai](https://github.com/mastra-ai/mastra/tree/main/voice/openai) instead. 
-## Installation +## Migration -```bash -npm install @mastra/speech-openai -``` - -## Configuration +The new package `@mastra/voice-openai` provides both Text-to-Speech and Speech-to-Text capabilities. To migrate: -The module requires the following environment variable: +1. Install the new package: ```bash -OPENAI_API_KEY=your_api_key +npm uninstall @mastra/speech-openai +npm install @mastra/voice-openai ``` -## Usage +2. Update your imports: ```typescript +// Old import { OpenAITTS } from '@mastra/speech-openai'; - -// Initialize with configuration -const tts = new OpenAITTS({ - model: { - name: 'alloy', // Default voice - apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var - }, -}); - -// List available voices -const voices = await tts.voices(); - -// Generate speech -const result = await tts.generate({ - voice: 'alloy', - text: 'Hello from Mastra!', -}); - -// Stream speech -const stream = await tts.stream({ - voice: 'alloy', - text: 'Hello from Mastra!', -}); +// New +import { OpenAIVoice } from '@mastra/voice-openai'; ``` -## Features - -- High-quality Text-to-Speech synthesis -- Multiple voice options -- Streaming support -- Natural and expressive speech output -- Fast generation times - -## Voice Options - -OpenAI provides several high-quality voices: - -- alloy (Neutral) -- echo (Male) -- fable (Male) -- onyx (Male) -- nova (Female) -- shimmer (Female) - -View the complete list in the `voices.ts` file or [OpenAI's documentation](https://platform.openai.com/docs/guides/text-to-speech). +For detailed migration instructions and new features, please refer to the [@mastra/voice-openai documentation](https://github.com/mastra-ai/mastra/tree/main/voice/openai). diff --git a/speech/openai/package.json b/speech/openai/package.json index 7924c2c4f48..974b8c08f33 100644 --- a/speech/openai/package.json +++ b/speech/openai/package.json @@ -1,7 +1,7 @@ { "name": "@mastra/speech-openai", "version": "0.1.3-alpha.0", - "description": "Mastra OpenAI speech integration", + "description": "Mastra OpenAI speech integration (deprecated, please use @mastra/voice-openai instead)", "type": "module", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -16,8 +16,7 @@ }, "scripts": { "build": "tsup src/index.ts --format esm --experimental-dts --clean --treeshake", - "build:watch": "pnpm build --watch", - "test": "vitest run" + "test": "echo \"deprecated\"" }, "dependencies": { "@mastra/core": "workspace:^", @@ -28,7 +27,6 @@ "@microsoft/api-extractor": "^7.49.2", "@types/node": "^22.13.1", "tsup": "^8.0.1", - "typescript": "^5.7.3", - "vitest": "^2.1.8" + "typescript": "^5.7.3" } } diff --git a/speech/openai/src/index.test.ts b/speech/openai/src/index.test.ts deleted file mode 100644 index 2e7cfe279fd..00000000000 --- a/speech/openai/src/index.test.ts +++ /dev/null @@ -1,114 +0,0 @@ -import { createWriteStream, writeFileSync } from 'fs'; -import path from 'path'; - -import { OpenAITTS } from './index.js'; - -describe('OpenAITTS Integration Tests', () => { - let tts: OpenAITTS; - - beforeAll(() => { - tts = new OpenAITTS({ - model: { - name: 'tts-1', - }, - }); - }); - - describe('stream', () => { - it('should stream audio data to file', async () => { - const { audioResult } = await tts.stream({ - text: 'Test streaming', - voice: 'alloy', - }); - - return new Promise((resolve, reject) => { - const outputPath = path.join(process.cwd(), 'test-outputs/stream-test.mp3'); - const fileStream = createWriteStream(outputPath); - const chunks: Buffer[] = []; - - audioResult.on('data', (chunk: 
Buffer) => { - chunks.push(chunk); - }); - - audioResult.pipe(fileStream); - - fileStream.on('finish', () => { - expect(chunks.length).toBeGreaterThan(0); - resolve(undefined); - }); - - audioResult.on('error', reject); - fileStream.on('error', reject); - }); - }), - 50000; - - it('should stream with different parameters and save to file', async () => { - const { audioResult } = await tts.stream({ - text: 'Testing with different voice and speed', - voice: 'nova', - speed: 1.2, - }); - - return new Promise((resolve, reject) => { - const outputPath = path.join(process.cwd(), 'test-outputs/stream-test-params.mp3'); - const fileStream = createWriteStream(outputPath); - - audioResult.pipe(fileStream); - - fileStream.on('finish', resolve); - audioResult.on('error', reject); - fileStream.on('error', reject); - }); - }); - }); - - describe('generate', () => { - it('should return a complete audio buffer and save to file', async () => { - const { audioResult } = await tts.generate({ - text: 'Hello World', - voice: 'alloy', - }); - - expect(Buffer.isBuffer(audioResult)).toBeTruthy(); - expect(audioResult.length).toBeGreaterThan(0); - - const outputPath = path.join(process.cwd(), 'test-outputs/open-aigenerate-test.mp3'); - writeFileSync(outputPath, audioResult); - }); - - it('should work with different parameters and save to file', async () => { - const { audioResult } = await tts.generate({ - text: 'Test with parameters', - voice: 'nova', - speed: 1.5, - }); - - expect(Buffer.isBuffer(audioResult)).toBeTruthy(); - - const outputPath = path.join(process.cwd(), 'test-outputs/open-nova-aigenerate-test.mp3'); - writeFileSync(outputPath, audioResult); - }); - }); - - // Error cases - describe('error handling', () => { - it('should handle invalid voice names', async () => { - await expect( - tts.stream({ - text: 'Test', - voice: 'invalid_voice', - }), - ).rejects.toThrow(); - }); - - it('should handle empty text', async () => { - await expect( - tts.stream({ - text: '', - voice: 'alloy', - }), - ).rejects.toThrow(); - }); - }); -}); diff --git a/speech/openai/src/index.ts b/speech/openai/src/index.ts index e84512585ac..96ca0c09cde 100644 --- a/speech/openai/src/index.ts +++ b/speech/openai/src/index.ts @@ -7,6 +7,10 @@ interface OpenAITTSConfig { apiKey?: string; } +throw new Error( + '@mastra/speech-openai is deprecated. Please use @mastra/voice-openai instead, which provides both Text-to-Speech and Speech-to-Text capabilities.', +); + export class OpenAITTS extends MastraTTS { client: OpenAI; constructor({ model }: { model: OpenAITTSConfig }) { diff --git a/voice/openai/CHANGELOG.md b/voice/openai/CHANGELOG.md new file mode 100644 index 00000000000..71691707c1a --- /dev/null +++ b/voice/openai/CHANGELOG.md @@ -0,0 +1,8 @@ +# @mastra/voice-openai + +## 0.1.0 + +### Changes + +- `@mastra/speech-openai` is now deprecated. Please use `@mastra/voice-openai` instead. +- This package provides both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities through OpenAI's API. diff --git a/voice/openai/README.md b/voice/openai/README.md new file mode 100644 index 00000000000..47bb1c29b29 --- /dev/null +++ b/voice/openai/README.md @@ -0,0 +1,106 @@ +# @mastra/voice-openai + +OpenAI Voice integration for Mastra, providing both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities using OpenAI's advanced models. 
+ +## Installation + +```bash +npm install @mastra/voice-openai +``` + +## Configuration + +The module requires an OpenAI API key, which can be provided through environment variables or directly in the configuration: + +```bash +OPENAI_API_KEY=your_api_key +``` + +## Usage + +### Using the Factory Function (Recommended) + +```typescript +import { createOpenAIVoice } from '@mastra/voice-openai'; + +// Create voice with both speech and listening capabilities +const voice = createOpenAIVoice({ + speech: { + model: 'tts-1', // or 'tts-1-hd' for higher quality + apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var + speaker: 'alloy', // Default voice + }, + listening: { + model: 'whisper-1', + apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var + }, +}); + +// Or create speech-only voice +const speechVoice = createOpenAIVoice({ + speech: { + model: 'tts-1', + speaker: 'nova', + }, +}); + +// Or create listening-only voice +const listeningVoice = createOpenAIVoice({ + listening: { + model: 'whisper-1', + }, +}); + +// List available voices +const speakers = await voice.getSpeakers(); + +// Generate speech +const audioStream = await voice.speak('Hello from Mastra!'); + +// Convert speech to text +const text = await voice.listen(audioStream, { + filetype: 'wav', +}); +``` + +### Using the Class Directly + +```typescript +import { OpenAIVoice } from '@mastra/voice-openai'; + +const voice = new OpenAIVoice({ + speechModel: { + name: 'tts-1', + apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var + }, + listeningModel: { + name: 'whisper-1', + apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var + }, + speaker: 'alloy', // Default voice +}); +``` + +## Features + +- High-quality Text-to-Speech synthesis +- Accurate Speech-to-Text transcription +- Multiple voice options +- Natural and expressive speech output +- Fast processing times + +## Voice Options + +OpenAI provides several high-quality voices: + +- alloy (Neutral) +- echo (Male) +- fable (Male) +- onyx (Male) +- nova (Female) +- shimmer (Female) +- ash (Male) +- coral (Female) +- sage (Male) + +View the complete list in OpenAI's [Text to Speech documentation](https://platform.openai.com/docs/guides/text-to-speech). 
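+
+## Saving Generated Audio
+
+The stream returned by `speak` is a standard Node.js readable stream, so it can be piped wherever a stream is accepted. A minimal sketch, using the `voice` instance from the examples above (the per-request `speaker` and `speed` options mirror the package source; the output filename is arbitrary):
+
+```typescript
+import { createWriteStream } from 'fs';
+
+// Generate speech with a per-request voice and speed, then write the MP3 audio to disk
+const audio = await voice.speak('Hello from Mastra!', { speaker: 'nova', speed: 1.2 });
+audio.pipe(createWriteStream('hello.mp3'));
+```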
diff --git a/voice/openai/__fixtures__/voice-test.m4a b/voice/openai/__fixtures__/voice-test.m4a new file mode 100644 index 00000000000..515a9a28ee2 Binary files /dev/null and b/voice/openai/__fixtures__/voice-test.m4a differ diff --git a/voice/openai/package.json b/voice/openai/package.json new file mode 100644 index 00000000000..225ce29f621 --- /dev/null +++ b/voice/openai/package.json @@ -0,0 +1,34 @@ +{ + "name": "@mastra/voice-openai", + "version": "0.1.0", + "description": "Mastra OpenAI speech integration", + "type": "module", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "exports": { + ".": { + "import": { + "types": "./dist/index.d.ts", + "default": "./dist/index.js" + } + }, + "./package.json": "./package.json" + }, + "scripts": { + "build": "tsup src/index.ts --format esm --experimental-dts --clean --treeshake", + "build:watch": "pnpm build --watch", + "test": "vitest run" + }, + "dependencies": { + "@mastra/core": "workspace:^", + "openai": "^4.28.0", + "zod": "^3.24.1" + }, + "devDependencies": { + "@microsoft/api-extractor": "^7.49.2", + "@types/node": "^22.13.1", + "tsup": "^8.3.6", + "typescript": "^5.7.3", + "vitest": "^2.1.8" + } +} diff --git a/voice/openai/src/index.test.ts b/voice/openai/src/index.test.ts new file mode 100644 index 00000000000..3e35f751aa1 --- /dev/null +++ b/voice/openai/src/index.test.ts @@ -0,0 +1,263 @@ +import { writeFileSync, mkdirSync, createReadStream } from 'fs'; +import path from 'path'; +import { PassThrough } from 'stream'; +import { describe, expect, it, beforeAll, beforeEach } from 'vitest'; + +import { createOpenAIVoice, OpenAIVoice } from './index.js'; + +describe('createOpenAIVoice', () => { + const outputDir = path.join(process.cwd(), 'test-outputs'); + let apiKey: string; + + beforeAll(() => { + try { + mkdirSync(outputDir, { recursive: true }); + } catch (err) { + // Ignore if directory already exists + } + }); + + beforeEach(() => { + apiKey = process.env.OPENAI_API_KEY || ''; + }); + + it('should create voice with speech capabilities and generate audio', async () => { + const voice = createOpenAIVoice({ + speech: { + model: 'tts-1', + speaker: 'alloy', + }, + }); + + expect(voice.speak).toBeDefined(); + expect(voice.getSpeakers).toBeDefined(); + expect(voice.listen).toBeUndefined(); + + const audioStream = await voice.speak!('Testing speech capabilities'); + const chunks: Buffer[] = []; + for await (const chunk of audioStream) { + chunks.push(Buffer.isBuffer(chunk) ? 
chunk : Buffer.from(chunk)); + } + const audioBuffer = Buffer.concat(chunks); + + expect(audioBuffer.length).toBeGreaterThan(0); + writeFileSync(path.join(outputDir, 'factory-speech-test.mp3'), audioBuffer); + }, 10000); + + it('should create voice with listening capabilities and transcribe audio', async () => { + const speechVoice = createOpenAIVoice({ + speech: { model: 'tts-1' }, + }); + + const audioStream = await speechVoice.speak!('Testing transcription capabilities'); + const voice = createOpenAIVoice({ + listening: { + model: 'whisper-1', + }, + }); + + expect(voice.listen).toBeDefined(); + expect(voice.speak).toBeUndefined(); + + const text = await voice.listen!(audioStream, { filetype: 'mp3' }); + expect(text.toLowerCase()).toContain('testing'); + expect(text.toLowerCase()).toContain('transcription'); + }, 15000); + + it('should create voice with both capabilities and round-trip audio', async () => { + const voice = createOpenAIVoice({ + speech: { + model: 'tts-1', + speaker: 'alloy', + }, + listening: { + model: 'whisper-1', + }, + }); + + expect(voice.speak).toBeDefined(); + expect(voice.listen).toBeDefined(); + + const originalText = 'Testing both speech and listening capabilities'; + const audioStream = await voice.speak!(originalText); + const transcribedText = await voice.listen!(audioStream, { filetype: 'mp3' }); + + expect(transcribedText.toLowerCase()).toContain('testing'); + expect(transcribedText.toLowerCase()).toContain('capabilities'); + }, 20000); + + it('should list available speakers', async () => { + const voice = createOpenAIVoice({ + speech: { + model: 'tts-1', + }, + }); + + const speakers = await voice.getSpeakers!(); + expect(speakers).toContainEqual({ voiceId: 'alloy' }); + expect(speakers).toContainEqual({ voiceId: 'nova' }); + expect(speakers.length).toBeGreaterThan(0); + }); + + it('should create voice without any capabilities', () => { + const voice = createOpenAIVoice(); + + expect(voice.speak).toBeUndefined(); + expect(voice.listen).toBeUndefined(); + expect(voice.getSpeakers).toBeUndefined(); + }); +}); + +describe('OpenAIVoice Integration Tests', () => { + let voice: OpenAIVoice; + const outputDir = path.join(process.cwd(), 'test-outputs'); + + beforeAll(() => { + // Create output directory if it doesn't exist + try { + mkdirSync(outputDir, { recursive: true }); + } catch (err) { + // Ignore if directory already exists + } + + voice = new OpenAIVoice({ + speechModel: { + name: 'tts-1', + }, + listeningModel: { + name: 'whisper-1', + }, + }); + }); + + describe('getSpeakers', () => { + it('should list available voices', async () => { + const speakers = await voice.getSpeakers(); + expect(speakers).toContainEqual({ voiceId: 'alloy' }); + expect(speakers).toContainEqual({ voiceId: 'nova' }); + }); + }); + + describe('speak', () => { + it('should generate audio stream from text', async () => { + const audioStream = await voice.speak('Hello World', { + speaker: 'alloy', + }); + + const chunks: Buffer[] = []; + for await (const chunk of audioStream) { + chunks.push(Buffer.isBuffer(chunk) ? 
chunk : Buffer.from(chunk)); + } + const audioBuffer = Buffer.concat(chunks); + + expect(audioBuffer.length).toBeGreaterThan(0); + + const outputPath = path.join(outputDir, 'speech-test.mp3'); + writeFileSync(outputPath, audioBuffer); + }, 10000); + + it('should work with different parameters', async () => { + const audioStream = await voice.speak('Test with parameters', { + speaker: 'nova', + speed: 0.5, + }); + + const chunks: Buffer[] = []; + for await (const chunk of audioStream) { + chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + } + const audioBuffer = Buffer.concat(chunks); + + expect(audioBuffer.length).toBeGreaterThan(0); + + const outputPath = path.join(outputDir, 'speech-test-params.mp3'); + writeFileSync(outputPath, audioBuffer); + }, 10000); + + it('should accept text stream as input', async () => { + const inputStream = new PassThrough(); + inputStream.end('Hello from stream'); + + const audioStream = await voice.speak(inputStream, { + speaker: 'alloy', + }); + + const chunks: Buffer[] = []; + for await (const chunk of audioStream) { + chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + } + const audioBuffer = Buffer.concat(chunks); + + expect(audioBuffer.length).toBeGreaterThan(0); + + const outputPath = path.join(outputDir, 'speech-stream-input.mp3'); + writeFileSync(outputPath, audioBuffer); + }, 10000); + }); + + describe('listen', () => { + it('should transcribe audio from fixture file', async () => { + const fixturePath = path.join(process.cwd(), '__fixtures__', 'voice-test.m4a'); + const audioStream = createReadStream(fixturePath); + + const text = await voice.listen(audioStream, { + filetype: 'm4a', + }); + + expect(text).toBeTruthy(); + console.log(text); + expect(typeof text).toBe('string'); + expect(text.length).toBeGreaterThan(0); + }, 15000); + + it('should transcribe audio stream', async () => { + // First generate some test audio + const audioStream = await voice.speak('This is a test for transcription', { + speaker: 'alloy', + }); + + // Then transcribe it + const text = await voice.listen(audioStream, { + filetype: 'm4a', + }); + + expect(text).toBeTruthy(); + expect(typeof text).toBe('string'); + expect(text.toLowerCase()).toContain('test'); + }, 15000); + + it('should accept options', async () => { + const audioStream = await voice.speak('Test with language option', { + speaker: 'nova', + }); + + const text = await voice.listen(audioStream, { + language: 'en', + filetype: 'm4a', + }); + + expect(text).toBeTruthy(); + expect(typeof text).toBe('string'); + expect(text.toLowerCase()).toContain('test'); + }, 15000); + }); + + // Error cases + describe('error handling', () => { + it('should handle invalid speaker names', async () => { + await expect( + voice.speak('Test', { + speaker: 'invalid_voice', + }), + ).rejects.toThrow(); + }); + + it('should handle empty text', async () => { + await expect( + voice.speak('', { + speaker: 'alloy', + }), + ).rejects.toThrow(); + }); + }); +}); diff --git a/voice/openai/src/index.ts b/voice/openai/src/index.ts new file mode 100644 index 00000000000..05a3b543e62 --- /dev/null +++ b/voice/openai/src/index.ts @@ -0,0 +1,205 @@ +import { MastraVoice } from '@mastra/core/voice'; +import OpenAI from 'openai'; +import { PassThrough } from 'stream'; + +type OpenAIVoiceId = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer' | 'ash' | 'coral' | 'sage'; +type OpenAIModel = 'tts-1' | 'tts-1-hd' | 'whisper-1'; +type OpenAIFileType = 'mp3' | 'mp4' | 'mpeg' | 'mpga' | 'm4a' | 'wav' | 'webm'; + 
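+/**
+ * Per-model configuration. `apiKey` is optional; when omitted, the
+ * OPENAI_API_KEY environment variable is used as a fallback.
+ */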
+export interface OpenAIConfig {
+  name?: OpenAIModel;
+  apiKey?: string;
+}
+
+export interface OpenAIVoiceConfig {
+  speech?: {
+    model: 'tts-1' | 'tts-1-hd';
+    apiKey?: string;
+    speaker?: OpenAIVoiceId;
+  };
+  listening?: {
+    model: 'whisper-1';
+    apiKey?: string;
+    language?: string;
+  };
+}
+
+export interface OpenAIVoiceCapabilities {
+  speak?: (
+    input: string | NodeJS.ReadableStream,
+    options?: { speaker?: string; speed?: number },
+  ) => Promise<NodeJS.ReadableStream>;
+  listen?: (
+    audioStream: NodeJS.ReadableStream,
+    options: { filetype: OpenAIFileType; [key: string]: any },
+  ) => Promise<string>;
+  getSpeakers?: () => Promise<Array<{ voiceId: string }>>;
+}
+
+/**
+ * Creates OpenAI voice capabilities
+ */
+export function createOpenAIVoice(config?: OpenAIVoiceConfig): OpenAIVoiceCapabilities {
+  const provider = new OpenAIVoice({
+    speechModel: config?.speech
+      ? {
+          name: config.speech.model,
+          apiKey: config.speech.apiKey,
+        }
+      : undefined,
+    listeningModel: config?.listening
+      ? {
+          name: config.listening.model,
+          apiKey: config.listening.apiKey,
+        }
+      : undefined,
+    speaker: config?.speech?.speaker,
+  });
+
+  return {
+    speak: config?.speech ? provider.speak.bind(provider) : undefined,
+    listen: config?.listening ? provider.listen.bind(provider) : undefined,
+    getSpeakers: config?.speech ? provider.getSpeakers.bind(provider) : undefined,
+  };
+}
+
+export class OpenAIVoice extends MastraVoice {
+  speechClient?: OpenAI;
+  listeningClient?: OpenAI;
+
+  constructor({
+    listeningModel,
+    speechModel,
+    speaker,
+  }: {
+    listeningModel?: OpenAIConfig;
+    speechModel?: OpenAIConfig;
+    speaker?: string;
+  }) {
+    super({
+      speechModel: speechModel && {
+        name: speechModel.name || 'tts-1',
+        apiKey: speechModel.apiKey,
+      },
+      listeningModel: listeningModel && {
+        name: listeningModel.name || 'whisper-1',
+        apiKey: listeningModel.apiKey,
+      },
+      speaker,
+    });
+
+    const defaultApiKey = process.env.OPENAI_API_KEY;
+
+    if (speechModel || defaultApiKey) {
+      const speechApiKey = speechModel?.apiKey || defaultApiKey;
+      if (!speechApiKey) {
+        throw new Error('No API key provided for speech model');
+      }
+      this.speechClient = new OpenAI({ apiKey: speechApiKey });
+    }
+
+    if (listeningModel || defaultApiKey) {
+      const listeningApiKey = listeningModel?.apiKey || defaultApiKey;
+      if (!listeningApiKey) {
+        throw new Error('No API key provided for listening model');
+      }
+      this.listeningClient = new OpenAI({ apiKey: listeningApiKey });
+    }
+
+    if (!this.speechClient && !this.listeningClient) {
+      throw new Error('At least one of OPENAI_API_KEY, speechModel.apiKey, or listeningModel.apiKey must be set');
+    }
+  }
+
+  async getSpeakers(): Promise<Array<{ voiceId: string }>> {
+    if (!this.speechModel) {
+      throw new Error('Speech model not configured');
+    }
+
+    return [
+      { voiceId: 'alloy' },
+      { voiceId: 'echo' },
+      { voiceId: 'fable' },
+      { voiceId: 'onyx' },
+      { voiceId: 'nova' },
+      { voiceId: 'shimmer' },
+      { voiceId: 'ash' },
+      { voiceId: 'coral' },
+      { voiceId: 'sage' },
+    ];
+  }
+
+  async speak(
+    input: string | NodeJS.ReadableStream,
+    options?: {
+      speaker?: string;
+      speed?: number;
+      [key: string]: any;
+    },
+  ): Promise<NodeJS.ReadableStream> {
+    if (!this.speechClient) {
+      throw new Error('Speech model not configured');
+    }
+
+    if (typeof input !== 'string') {
+      const chunks: Buffer[] = [];
+      for await (const chunk of input) {
+        chunks.push(Buffer.from(chunk));
+      }
+      input = Buffer.concat(chunks).toString('utf-8');
+    }
+
+    if (input.trim().length === 0) {
+      throw new Error('Input text is empty');
+    }
+
+    const audio = await this.traced(async () => {
+      const response = await this.speechClient!.audio.speech.create({
+        model: this.speechModel?.name || 'tts-1',
+        voice: (options?.speaker || 'alloy') as OpenAIVoiceId,
+        input,
+        speed: options?.speed || 1.0,
+      });
+
+      const passThrough = new PassThrough();
+      const buffer = Buffer.from(await response.arrayBuffer());
+      passThrough.end(buffer);
+      return passThrough;
+    }, 'voice.openai.speak')();
+
+    return audio;
+  }
+
+  async listen(
+    audioStream: NodeJS.ReadableStream,
+    options: {
+      filetype: 'mp3' | 'mp4' | 'mpeg' | 'mpga' | 'm4a' | 'wav' | 'webm';
+      [key: string]: any;
+    },
+  ): Promise<string> {
+    if (!this.listeningClient) {
+      throw new Error('Listening model not configured');
+    }
+
+    const chunks: Buffer[] = [];
+    for await (const chunk of audioStream) {
+      chunks.push(Buffer.from(chunk));
+    }
+    const audioBuffer = Buffer.concat(chunks);
+
+    const text = await this.traced(async () => {
+      const { filetype, ...otherOptions } = options || {};
+      const file = new File([audioBuffer], `audio.${filetype}`);
+
+      const response = await this.listeningClient!.audio.transcriptions.create({
+        model: this.listeningModel?.name || 'whisper-1',
+        file: file as any,
+        ...otherOptions,
+      });
+
+      return response.text;
+    }, 'voice.openai.listen')();
+
+    return text;
+  }
+}
diff --git a/voice/openai/test-outputs/speech-stream-input.mp3 b/voice/openai/test-outputs/speech-stream-input.mp3
new file mode 100644
index 00000000000..a36ab1de83d
Binary files /dev/null and b/voice/openai/test-outputs/speech-stream-input.mp3 differ
diff --git a/voice/openai/test-outputs/speech-test-params.mp3 b/voice/openai/test-outputs/speech-test-params.mp3
new file mode 100644
index 00000000000..33fa259e0f6
Binary files /dev/null and b/voice/openai/test-outputs/speech-test-params.mp3 differ
diff --git a/voice/openai/test-outputs/speech-test.mp3 b/voice/openai/test-outputs/speech-test.mp3
new file mode 100644
index 00000000000..78a5dd9e341
Binary files /dev/null and b/voice/openai/test-outputs/speech-test.mp3 differ
diff --git a/voice/openai/tsconfig.json b/voice/openai/tsconfig.json
new file mode 100644
index 00000000000..6750fddcd4d
--- /dev/null
+++ b/voice/openai/tsconfig.json
@@ -0,0 +1,5 @@
+{
+  "extends": "../../tsconfig.node.json",
+  "include": ["src/**/*"],
+  "exclude": ["node_modules", "**/*.test.ts"]
+}
diff --git a/speech/openai/vitest.config.ts b/voice/openai/vitest.config.ts
similarity index 100%
rename from speech/openai/vitest.config.ts
rename to voice/openai/vitest.config.ts