feat(server): add voice session handler with Groq Whisper STT

2026-02-20 01:57:53 +05:30
parent 4a128f7719
commit 63276d3573
1 changed files with 233 additions and 0 deletions
--- a/server/src/ws/voice.ts
+++ b/server/src/ws/voice.ts
@@ -0,0 +1,233 @@
 import type { ServerWebSocket } from "bun";
 import type { WebSocketData } from "./sessions.js";
 // ── Types ────────────────────────────────────────────────
 interface VoiceSession {
  chunks: Buffer[];
  totalBytes: number;
  partialTimer: ReturnType<typeof setInterval> | null;
  lastPartialOffset: number;
 }
 // ── State ────────────────────────────────────────────────
 const activeSessions = new Map<string, VoiceSession>();
 // ── Audio constants ──────────────────────────────────────
 const SAMPLE_RATE = 16_000;
 const CHANNELS = 1;
 const BITS_PER_SAMPLE = 16;
 const PARTIAL_INTERVAL_MS = 2_000;
 /** Minimum bytes before attempting first transcription (100ms of 16kHz mono 16-bit) */
 const MIN_AUDIO_BYTES = 3_200;
 // ── Exported handlers ────────────────────────────────────
 /**
 * Start a voice session for a device. Creates a buffer and starts a
 * periodic timer that sends accumulated audio to Groq Whisper for
 * partial transcripts every ~2s.
 */
 export function handleVoiceStart(
  ws: ServerWebSocket<WebSocketData>,
  deviceId: string,
  groqApiKey: string
 ): void {
  // Clean up any existing session for this device
  cleanupSession(deviceId);
  const session: VoiceSession = {
    chunks: [],
    totalBytes: 0,
    partialTimer: null,
    lastPartialOffset: 0,
  };
  activeSessions.set(deviceId, session);
  // Start periodic partial transcription
  session.partialTimer = setInterval(async () => {
    // Only transcribe if there's new audio since last partial
    if (session.totalBytes <= session.lastPartialOffset) return;
    if (session.totalBytes < MIN_AUDIO_BYTES) return;
    try {
      const pcm = concatChunks(session.chunks);
      const text = await transcribeAudio(pcm, groqApiKey);
      session.lastPartialOffset = session.totalBytes;
      if (text) {
        sendToDevice(ws, { type: "transcript_partial", text });
      }
    } catch (err) {
      console.error(`[Voice] Partial transcription failed for ${deviceId}:`, err);
    }
  }, PARTIAL_INTERVAL_MS);
 }
 /**
 * Append a base64-encoded PCM audio chunk to the session buffer.
 */
 export function handleVoiceChunk(deviceId: string, base64Data: string): void {
  const session = activeSessions.get(deviceId);
  if (!session) {
    console.warn(`[Voice] Chunk received for unknown session: ${deviceId}`);
    return;
  }
  const decoded = Buffer.from(base64Data, "base64");
  session.chunks.push(decoded);
  session.totalBytes += decoded.length;
 }
 /**
 * Stop the partial timer, send the complete audio to Groq for a final
 * transcript, relay it to the device, clean up, and return the text.
 */
 export async function handleVoiceSend(
  ws: ServerWebSocket<WebSocketData>,
  deviceId: string,
  groqApiKey: string
 ): Promise<string> {
  const session = activeSessions.get(deviceId);
  if (!session) {
    console.warn(`[Voice] Send requested for unknown session: ${deviceId}`);
    return "";
  }
  // Stop partial timer
  if (session.partialTimer !== null) {
    clearInterval(session.partialTimer);
    session.partialTimer = null;
  }
  let transcript = "";
  if (session.totalBytes >= MIN_AUDIO_BYTES) {
    try {
      const pcm = concatChunks(session.chunks);
      transcript = await transcribeAudio(pcm, groqApiKey);
    } catch (err) {
      console.error(`[Voice] Final transcription failed for ${deviceId}:`, err);
    }
  }
  sendToDevice(ws, { type: "transcript_final", text: transcript });
  // Clean up session
  activeSessions.delete(deviceId);
  return transcript;
 }
 /**
 * Cancel a voice session: stop the timer and discard all audio.
 */
 export function handleVoiceCancel(deviceId: string): void {
  cleanupSession(deviceId);
 }
 // ── Internal helpers ─────────────────────────────────────
 /**
 * Concatenate all buffered chunks into a single Buffer.
 */
 function concatChunks(chunks: Buffer[]): Buffer {
  return Buffer.concat(chunks);
 }
 /**
 * Clean up and remove a voice session.
 */
 function cleanupSession(deviceId: string): void {
  const session = activeSessions.get(deviceId);
  if (!session) return;
  if (session.partialTimer !== null) {
    clearInterval(session.partialTimer);
  }
  activeSessions.delete(deviceId);
 }
 /**
 * Wrap raw PCM data in a WAV container and send it to Groq's
 * Whisper API for transcription. Returns the transcribed text.
 */
 async function transcribeAudio(pcmBuffer: Buffer, apiKey: string): Promise<string> {
  const wav = pcmToWav(pcmBuffer, SAMPLE_RATE, CHANNELS, BITS_PER_SAMPLE);
  const formData = new FormData();
  const wavBytes = new Uint8Array(wav.buffer, wav.byteOffset, wav.byteLength) as BlobPart;
  formData.append("file", new Blob([wavBytes], { type: "audio/wav" }), "audio.wav");
  formData.append("model", "whisper-large-v3");
  const response = await fetch("https://api.groq.com/openai/v1/audio/transcriptions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
    },
    body: formData,
  });
  if (!response.ok) {
    const body = await response.text();
    throw new Error(`Groq Whisper API error ${response.status}: ${body}`);
  }
  const result = (await response.json()) as { text: string };
  return result.text ?? "";
 }
 /**
 * Create a 44-byte WAV header + PCM data buffer.
 */
 function pcmToWav(
  pcm: Buffer,
  sampleRate: number,
  channels: number,
  bitsPerSample: number
 ): Buffer {
  const byteRate = (sampleRate * channels * bitsPerSample) / 8;
  const blockAlign = (channels * bitsPerSample) / 8;
  const dataSize = pcm.length;
  const headerSize = 44;
  const buffer = Buffer.alloc(headerSize + dataSize);
  // RIFF header
  buffer.write("RIFF", 0);
  buffer.writeUInt32LE(36 + dataSize, 4); // file size - 8
  buffer.write("WAVE", 8);
  // fmt subchunk
  buffer.write("fmt ", 12);
  buffer.writeUInt32LE(16, 16); // subchunk1 size (PCM = 16)
  buffer.writeUInt16LE(1, 20); // audio format (PCM = 1)
  buffer.writeUInt16LE(channels, 22);
  buffer.writeUInt32LE(sampleRate, 24);
  buffer.writeUInt32LE(byteRate, 28);
  buffer.writeUInt16LE(blockAlign, 32);
  buffer.writeUInt16LE(bitsPerSample, 34);
  // data subchunk
  buffer.write("data", 36);
  buffer.writeUInt32LE(dataSize, 40);
  // PCM data
  pcm.copy(buffer, headerSize);
  return buffer;
 }
 /**
 * Send a JSON message to a device WebSocket (safe — catches send errors).
 */
 function sendToDevice(ws: ServerWebSocket<WebSocketData>, msg: Record<string, unknown>): void {
  try {
    ws.send(JSON.stringify(msg));
  } catch {
    // device disconnected
  }
 }