feat(server): add voice session handler with Groq Whisper STT
This commit is contained in:
233
server/src/ws/voice.ts
Normal file
233
server/src/ws/voice.ts
Normal file
@@ -0,0 +1,233 @@
|
||||
import type { ServerWebSocket } from "bun";
|
||||
import type { WebSocketData } from "./sessions.js";
|
||||
|
||||
// ── Types ────────────────────────────────────────────────
|
||||
|
||||
interface VoiceSession {
|
||||
chunks: Buffer[];
|
||||
totalBytes: number;
|
||||
partialTimer: ReturnType<typeof setInterval> | null;
|
||||
lastPartialOffset: number;
|
||||
}
|
||||
|
||||
// ── State ────────────────────────────────────────────────
|
||||
|
||||
const activeSessions = new Map<string, VoiceSession>();
|
||||
|
||||
// ── Audio constants ──────────────────────────────────────
|
||||
|
||||
const SAMPLE_RATE = 16_000;
|
||||
const CHANNELS = 1;
|
||||
const BITS_PER_SAMPLE = 16;
|
||||
const PARTIAL_INTERVAL_MS = 2_000;
|
||||
/** Minimum bytes before attempting first transcription (100ms of 16kHz mono 16-bit) */
|
||||
const MIN_AUDIO_BYTES = 3_200;
|
||||
|
||||
// ── Exported handlers ────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Start a voice session for a device. Creates a buffer and starts a
|
||||
* periodic timer that sends accumulated audio to Groq Whisper for
|
||||
* partial transcripts every ~2s.
|
||||
*/
|
||||
export function handleVoiceStart(
|
||||
ws: ServerWebSocket<WebSocketData>,
|
||||
deviceId: string,
|
||||
groqApiKey: string
|
||||
): void {
|
||||
// Clean up any existing session for this device
|
||||
cleanupSession(deviceId);
|
||||
|
||||
const session: VoiceSession = {
|
||||
chunks: [],
|
||||
totalBytes: 0,
|
||||
partialTimer: null,
|
||||
lastPartialOffset: 0,
|
||||
};
|
||||
|
||||
activeSessions.set(deviceId, session);
|
||||
|
||||
// Start periodic partial transcription
|
||||
session.partialTimer = setInterval(async () => {
|
||||
// Only transcribe if there's new audio since last partial
|
||||
if (session.totalBytes <= session.lastPartialOffset) return;
|
||||
if (session.totalBytes < MIN_AUDIO_BYTES) return;
|
||||
|
||||
try {
|
||||
const pcm = concatChunks(session.chunks);
|
||||
const text = await transcribeAudio(pcm, groqApiKey);
|
||||
session.lastPartialOffset = session.totalBytes;
|
||||
|
||||
if (text) {
|
||||
sendToDevice(ws, { type: "transcript_partial", text });
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`[Voice] Partial transcription failed for ${deviceId}:`, err);
|
||||
}
|
||||
}, PARTIAL_INTERVAL_MS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a base64-encoded PCM audio chunk to the session buffer.
|
||||
*/
|
||||
export function handleVoiceChunk(deviceId: string, base64Data: string): void {
|
||||
const session = activeSessions.get(deviceId);
|
||||
if (!session) {
|
||||
console.warn(`[Voice] Chunk received for unknown session: ${deviceId}`);
|
||||
return;
|
||||
}
|
||||
|
||||
const decoded = Buffer.from(base64Data, "base64");
|
||||
session.chunks.push(decoded);
|
||||
session.totalBytes += decoded.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop the partial timer, send the complete audio to Groq for a final
|
||||
* transcript, relay it to the device, clean up, and return the text.
|
||||
*/
|
||||
export async function handleVoiceSend(
|
||||
ws: ServerWebSocket<WebSocketData>,
|
||||
deviceId: string,
|
||||
groqApiKey: string
|
||||
): Promise<string> {
|
||||
const session = activeSessions.get(deviceId);
|
||||
if (!session) {
|
||||
console.warn(`[Voice] Send requested for unknown session: ${deviceId}`);
|
||||
return "";
|
||||
}
|
||||
|
||||
// Stop partial timer
|
||||
if (session.partialTimer !== null) {
|
||||
clearInterval(session.partialTimer);
|
||||
session.partialTimer = null;
|
||||
}
|
||||
|
||||
let transcript = "";
|
||||
|
||||
if (session.totalBytes >= MIN_AUDIO_BYTES) {
|
||||
try {
|
||||
const pcm = concatChunks(session.chunks);
|
||||
transcript = await transcribeAudio(pcm, groqApiKey);
|
||||
} catch (err) {
|
||||
console.error(`[Voice] Final transcription failed for ${deviceId}:`, err);
|
||||
}
|
||||
}
|
||||
|
||||
sendToDevice(ws, { type: "transcript_final", text: transcript });
|
||||
|
||||
// Clean up session
|
||||
activeSessions.delete(deviceId);
|
||||
|
||||
return transcript;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel a voice session: stop the timer and discard all audio.
|
||||
*/
|
||||
export function handleVoiceCancel(deviceId: string): void {
|
||||
cleanupSession(deviceId);
|
||||
}
|
||||
|
||||
// ── Internal helpers ─────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Concatenate all buffered chunks into a single Buffer.
|
||||
*/
|
||||
function concatChunks(chunks: Buffer[]): Buffer {
|
||||
return Buffer.concat(chunks);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean up and remove a voice session.
|
||||
*/
|
||||
function cleanupSession(deviceId: string): void {
|
||||
const session = activeSessions.get(deviceId);
|
||||
if (!session) return;
|
||||
|
||||
if (session.partialTimer !== null) {
|
||||
clearInterval(session.partialTimer);
|
||||
}
|
||||
activeSessions.delete(deviceId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrap raw PCM data in a WAV container and send it to Groq's
|
||||
* Whisper API for transcription. Returns the transcribed text.
|
||||
*/
|
||||
async function transcribeAudio(pcmBuffer: Buffer, apiKey: string): Promise<string> {
|
||||
const wav = pcmToWav(pcmBuffer, SAMPLE_RATE, CHANNELS, BITS_PER_SAMPLE);
|
||||
|
||||
const formData = new FormData();
|
||||
const wavBytes = new Uint8Array(wav.buffer, wav.byteOffset, wav.byteLength) as BlobPart;
|
||||
formData.append("file", new Blob([wavBytes], { type: "audio/wav" }), "audio.wav");
|
||||
formData.append("model", "whisper-large-v3");
|
||||
|
||||
const response = await fetch("https://api.groq.com/openai/v1/audio/transcriptions", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
},
|
||||
body: formData,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const body = await response.text();
|
||||
throw new Error(`Groq Whisper API error ${response.status}: ${body}`);
|
||||
}
|
||||
|
||||
const result = (await response.json()) as { text: string };
|
||||
return result.text ?? "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a 44-byte WAV header + PCM data buffer.
|
||||
*/
|
||||
function pcmToWav(
|
||||
pcm: Buffer,
|
||||
sampleRate: number,
|
||||
channels: number,
|
||||
bitsPerSample: number
|
||||
): Buffer {
|
||||
const byteRate = (sampleRate * channels * bitsPerSample) / 8;
|
||||
const blockAlign = (channels * bitsPerSample) / 8;
|
||||
const dataSize = pcm.length;
|
||||
const headerSize = 44;
|
||||
|
||||
const buffer = Buffer.alloc(headerSize + dataSize);
|
||||
|
||||
// RIFF header
|
||||
buffer.write("RIFF", 0);
|
||||
buffer.writeUInt32LE(36 + dataSize, 4); // file size - 8
|
||||
buffer.write("WAVE", 8);
|
||||
|
||||
// fmt subchunk
|
||||
buffer.write("fmt ", 12);
|
||||
buffer.writeUInt32LE(16, 16); // subchunk1 size (PCM = 16)
|
||||
buffer.writeUInt16LE(1, 20); // audio format (PCM = 1)
|
||||
buffer.writeUInt16LE(channels, 22);
|
||||
buffer.writeUInt32LE(sampleRate, 24);
|
||||
buffer.writeUInt32LE(byteRate, 28);
|
||||
buffer.writeUInt16LE(blockAlign, 32);
|
||||
buffer.writeUInt16LE(bitsPerSample, 34);
|
||||
|
||||
// data subchunk
|
||||
buffer.write("data", 36);
|
||||
buffer.writeUInt32LE(dataSize, 40);
|
||||
|
||||
// PCM data
|
||||
pcm.copy(buffer, headerSize);
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a JSON message to a device WebSocket (safe — catches send errors).
|
||||
*/
|
||||
function sendToDevice(ws: ServerWebSocket<WebSocketData>, msg: Record<string, unknown>): void {
|
||||
try {
|
||||
ws.send(JSON.stringify(msg));
|
||||
} catch {
|
||||
// device disconnected
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user