Streaming: add speaker labels, max speakers and the whisper-rt model (4.25.0)

StreamingTranscriberParams gains the optional speakerLabels and maxSpeakers
fields. When either one is defined, the connection URL carries it as the
speaker_labels / max_speakers query parameter. StreamingSpeechModel gains
"whisper-rt" and TurnEvent gains the optional speaker_label field. The unit
tests cover the new URL parameters and the speaker_label turn field.

diff --git a/package.json b/package.json
index 742c89c..b1b6108 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "assemblyai",
-  "version": "4.24.0",
+  "version": "4.25.0",
   "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
   "engines": {
     "node": ">=18"
diff --git a/src/services/streaming/service.ts b/src/services/streaming/service.ts
index 9cbaeca..0955a52 100644
--- a/src/services/streaming/service.ts
+++ b/src/services/streaming/service.ts
@@ -165,6 +165,14 @@ export class StreamingTranscriber {
       );
     }
 
+    if (this.params.speakerLabels !== undefined) {
+      searchParams.set("speaker_labels", this.params.speakerLabels.toString());
+    }
+
+    if (this.params.maxSpeakers !== undefined) {
+      searchParams.set("max_speakers", this.params.maxSpeakers.toString());
+    }
+
     url.search = searchParams.toString();
 
     return url;
diff --git a/src/types/streaming/index.ts b/src/types/streaming/index.ts
index 05e0c97..d37f59b 100644
--- a/src/types/streaming/index.ts
+++ b/src/types/streaming/index.ts
@@ -22,6 +22,8 @@ export type StreamingTranscriberParams = {
   speechModel?: StreamingSpeechModel;
   languageDetection?: boolean;
   inactivityTimeout?: number;
+  speakerLabels?: boolean;
+  maxSpeakers?: number;
 };
 
 export type StreamingEvents = "open" | "close" | "turn" | "error";
@@ -37,6 +39,7 @@ export type StreamingSpeechModel =
   | "universal-streaming-english"
   | "universal-streaming-multilingual"
   | "u3-rt-pro"
+  | "whisper-rt"
   | "u3-pro";
 
 export type StreamingTokenParams = {
@@ -66,6 +69,7 @@ export type TurnEvent = {
   words: StreamingWord[];
   language_code?: string;
   language_confidence?: number;
+  speaker_label?: string;
 };
 
 export type StreamingWord = {
diff --git a/tests/unit/streaming.test.ts b/tests/unit/streaming.test.ts
index a2759e0..01a498d 100644
--- a/tests/unit/streaming.test.ts
+++ b/tests/unit/streaming.test.ts
@@ -59,4 +59,76 @@ describe("streaming", () => {
   }
 
   it("noop", async () => {});
+
+  it("should include speaker_labels in connection URL", async () => {
+    await cleanup();
+    WS.clean();
+
+    const wsUrl = `${websocketBaseUrl}?token=123&sample_rate=16000&speaker_labels=true`;
+    server = new WS(wsUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "123",
+      sampleRate: 16_000,
+      speakerLabels: true,
+    });
+    onOpen = jest.fn();
+    rt.on("open", onOpen);
+    await connect(rt, server);
+  });
+
+  it("should include speaker_labels and max_speakers in connection URL", async () => {
+    await cleanup();
+    WS.clean();
+
+    const wsUrl = `${websocketBaseUrl}?token=123&sample_rate=16000&speaker_labels=true&max_speakers=4`;
+    server = new WS(wsUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "123",
+      sampleRate: 16_000,
+      speakerLabels: true,
+      maxSpeakers: 4,
+    });
+    onOpen = jest.fn();
+    rt.on("open", onOpen);
+    await connect(rt, server);
+  });
+
+  it("should include whisper-rt speech model in connection URL", async () => {
+    await cleanup();
+    WS.clean();
+
+    const wsUrl = `${websocketBaseUrl}?token=123&sample_rate=16000&speech_model=whisper-rt`;
+    server = new WS(wsUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "123",
+      sampleRate: 16_000,
+      speechModel: "whisper-rt",
+    });
+    onOpen = jest.fn();
+    rt.on("open", onOpen);
+    await connect(rt, server);
+  });
+
+  it("should parse speaker_label from turn event", async () => {
+    const turnPromise = new Promise<{ speaker_label?: string }>((resolve) => {
+      rt.on("turn", (event) => resolve(event));
+    });
+    server.send(
+      JSON.stringify({
+        type: "Turn",
+        turn_order: 1,
+        turn_is_formatted: true,
+        end_of_turn: true,
+        transcript: "hello",
+        end_of_turn_confidence: 0.9,
+        words: [],
+        speaker_label: "A",
+      }),
+    );
+    const turn = await turnPromise;
+    expect(turn.speaker_label).toBe("A");
+  });
 });