From 669aa3d9b1b2f460f0d6a35ca031a0b6410a989b Mon Sep 17 00:00:00 2001 From: Sanju Sivalingam Date: Fri, 20 Feb 2026 01:52:40 +0530 Subject: [PATCH] docs: add voice overlay implementation plan 11 tasks covering shared protocol types, server Groq Whisper STT handler, Android VoiceRecorder, gradient border overlay, voice panel UI, AgentOverlay state machine, ConnectionService wiring, and permission handling. --- docs/plans/2026-02-20-voice-overlay-plan.md | 1509 +++++++++++++++++++ 1 file changed, 1509 insertions(+) create mode 100644 docs/plans/2026-02-20-voice-overlay-plan.md diff --git a/docs/plans/2026-02-20-voice-overlay-plan.md b/docs/plans/2026-02-20-voice-overlay-plan.md new file mode 100644 index 0000000..3246c86 --- /dev/null +++ b/docs/plans/2026-02-20-voice-overlay-plan.md @@ -0,0 +1,1509 @@ +# Voice Overlay Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add voice-activated overlay to the Android app — tap pill, speak a goal, send it for execution. + +**Architecture:** Android captures audio via `AudioRecord`, streams base64 PCM chunks over the existing WebSocket to the Bun server. Server buffers chunks and calls Groq Whisper API every ~2s for partial transcripts (fast batch, not true streaming). On "send", server gets final transcript and fires it into `runPipeline()` — the same path as web dashboard goals. Android overlay shows a glowing gradient border during listening and renders live transcript text. 
+ +**Tech Stack:** Kotlin/Compose (Android), Bun/TypeScript (server), Groq Whisper API (STT), Ktor WebSocket (transport) + +**Design doc:** `docs/plans/2026-02-20-voice-overlay-design.md` + +--- + +### Task 1: Add Voice Protocol Types (shared package) + +**Files:** +- Modify: `packages/shared/src/protocol.ts:1-46` + +**Step 1: Add voice message types to DeviceMessage union** + +In `packages/shared/src/protocol.ts`, add three new variants to the `DeviceMessage` type after the `stop_goal` line (line 11): + +```typescript +export type DeviceMessage = + | { type: "auth"; apiKey: string; deviceInfo?: DeviceInfo } + | { type: "screen"; requestId: string; elements: UIElement[]; screenshot?: string; packageName?: string } + | { type: "result"; requestId: string; success: boolean; error?: string; data?: string } + | { type: "goal"; text: string } + | { type: "pong" } + | { type: "heartbeat"; batteryLevel: number; isCharging: boolean } + | { type: "apps"; apps: InstalledApp[] } + | { type: "stop_goal" } + // Voice overlay + | { type: "voice_start" } + | { type: "voice_chunk"; data: string } + | { type: "voice_stop"; action: "send" | "cancel" }; +``` + +**Step 2: Add voice response types to ServerToDeviceMessage union** + +Add two new variants after the `goal_completed` line (line 38): + +```typescript + // Voice overlay + | { type: "transcript_partial"; text: string } + | { type: "transcript_final"; text: string } +``` + +**Step 3: Verify types compile** + +Run: `cd packages/shared && bun run tsc --noEmit` +Expected: No errors + +**Step 4: Commit** + +```bash +git add packages/shared/src/protocol.ts +git commit -m "feat(shared): add voice overlay protocol types" +``` + +--- + +### Task 2: Server Voice Handler + +**Files:** +- Create: `server/src/ws/voice.ts` + +**Step 1: Create the voice session handler** + +This file manages one voice session per device: buffers audio chunks, calls Groq Whisper batch API for partial transcripts every ~2s, and produces the final transcript 
on stop. + +```typescript +/** + * Voice session handler for DroidClaw. + * + * Buffers PCM audio chunks from the Android device, periodically sends + * accumulated audio to Groq Whisper for partial transcription, and + * produces a final transcript when the user taps Send. + * + * Uses Groq's OpenAI-compatible /audio/transcriptions endpoint (batch, + * not streaming) — but sends intermediate chunks every ~2s to simulate + * live streaming text. + */ + +import type { ServerWebSocket } from "bun"; +import type { WebSocketData } from "./sessions.js"; + +interface VoiceSession { + chunks: Buffer[]; + totalBytes: number; + partialTimer: ReturnType<typeof setInterval> | null; + lastPartialOffset: number; // byte offset of last partial sent +} + +const activeSessions = new Map<string, VoiceSession>(); + +const GROQ_API_URL = "https://api.groq.com/openai/v1/audio/transcriptions"; +const PARTIAL_INTERVAL_MS = 2000; +const WHISPER_MODEL = "whisper-large-v3"; + +/** + * Call Groq Whisper transcription API with a PCM audio buffer. + * Returns the transcribed text, or empty string on failure. + */ +async function transcribeAudio( + pcmBuffer: Buffer, + apiKey: string +): Promise<string> { + // Convert raw PCM (16kHz, mono, 16-bit) to WAV for the API + const wavBuffer = pcmToWav(pcmBuffer, 16000, 1, 16); + + const formData = new FormData(); + formData.append("file", new Blob([wavBuffer], { type: "audio/wav" }), "audio.wav"); + formData.append("model", WHISPER_MODEL); + formData.append("language", "en"); + formData.append("response_format", "text"); + + try { + const res = await fetch(GROQ_API_URL, { + method: "POST", + headers: { Authorization: `Bearer ${apiKey}` }, + body: formData, + }); + if (!res.ok) { + console.error(`[Voice] Groq API error: ${res.status} ${await res.text()}`); + return ""; + } + return (await res.text()).trim(); + } catch (err) { + console.error(`[Voice] Transcription failed: ${err}`); + return ""; + } +} + +/** + * Wrap raw PCM data in a WAV header. 
+ */ +function pcmToWav( + pcm: Buffer, + sampleRate: number, + channels: number, + bitsPerSample: number +): Buffer { + const byteRate = (sampleRate * channels * bitsPerSample) / 8; + const blockAlign = (channels * bitsPerSample) / 8; + const header = Buffer.alloc(44); + + header.write("RIFF", 0); + header.writeUInt32LE(36 + pcm.length, 4); + header.write("WAVE", 8); + header.write("fmt ", 12); + header.writeUInt32LE(16, 16); // PCM chunk size + header.writeUInt16LE(1, 20); // PCM format + header.writeUInt16LE(channels, 22); + header.writeUInt32LE(sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(bitsPerSample, 34); + header.write("data", 36); + header.writeUInt32LE(pcm.length, 40); + + return Buffer.concat([header, pcm]); +} + +function sendToDevice(ws: ServerWebSocket<WebSocketData>, msg: Record<string, unknown>) { + try { + ws.send(JSON.stringify(msg)); + } catch { /* disconnected */ } +} + +/** + * Start a voice session for a device. + */ +export function handleVoiceStart( + ws: ServerWebSocket<WebSocketData>, + deviceId: string, + groqApiKey: string +) { + // Clean up any existing session + handleVoiceCancel(deviceId); + + const session: VoiceSession = { + chunks: [], + totalBytes: 0, + partialTimer: null, + lastPartialOffset: 0, + }; + + // Start periodic partial transcription + session.partialTimer = setInterval(async () => { + if (session.totalBytes <= session.lastPartialOffset) return; + if (session.totalBytes < 3200) return; // need at least 100ms of audio (16kHz * 2 bytes * 0.1s) + + const fullBuffer = Buffer.concat(session.chunks); + const text = await transcribeAudio(fullBuffer, groqApiKey); + if (text) { + session.lastPartialOffset = session.totalBytes; + sendToDevice(ws, { type: "transcript_partial", text }); + } + }, PARTIAL_INTERVAL_MS); + + activeSessions.set(deviceId, session); + console.log(`[Voice] Session started for device ${deviceId}`); +} + +/** + * Append an audio chunk to the session buffer. 
+ */ +export function handleVoiceChunk(deviceId: string, base64Data: string) { + const session = activeSessions.get(deviceId); + if (!session) return; + + const chunk = Buffer.from(base64Data, "base64"); + session.chunks.push(chunk); + session.totalBytes += chunk.length; +} + +/** + * Finalize the voice session: get final transcript and fire as goal. + * Returns the final transcript text. + */ +export async function handleVoiceSend( + ws: ServerWebSocket<WebSocketData>, + deviceId: string, + groqApiKey: string +): Promise<string> { + const session = activeSessions.get(deviceId); + if (!session) return ""; + + // Stop partial timer + if (session.partialTimer) clearInterval(session.partialTimer); + + // Get final transcript from complete audio + const fullBuffer = Buffer.concat(session.chunks); + const text = await transcribeAudio(fullBuffer, groqApiKey); + + // Send final transcript to device + sendToDevice(ws, { type: "transcript_final", text }); + + // Clean up + activeSessions.delete(deviceId); + console.log(`[Voice] Session finalized for device ${deviceId}: "${text}"`); + + return text; +} + +/** + * Cancel and discard a voice session. 
+ */ +export function handleVoiceCancel(deviceId: string) { + const session = activeSessions.get(deviceId); + if (!session) return; + + if (session.partialTimer) clearInterval(session.partialTimer); + activeSessions.delete(deviceId); + console.log(`[Voice] Session cancelled for device ${deviceId}`); +} +``` + +**Step 2: Verify types compile** + +Run: `cd server && bun run typecheck` +Expected: No errors + +**Step 3: Commit** + +```bash +git add server/src/ws/voice.ts +git commit -m "feat(server): add voice session handler with Groq Whisper STT" +``` + +--- + +### Task 3: Wire Voice Messages into Server Device Handler + +**Files:** +- Modify: `server/src/ws/device.ts:191-370` + +**Step 1: Import voice handler functions** + +At the top of `server/src/ws/device.ts`, add after line 7: + +```typescript +import { + handleVoiceStart, + handleVoiceChunk, + handleVoiceSend, + handleVoiceCancel, +} from "./voice.js"; +``` + +**Step 2: Add voice cases to the switch statement** + +In `handleDeviceMessage()`, add new cases inside the `switch (msg.type)` block, before the `default` case (before line 364): + +```typescript + case "voice_start": { + const deviceId = ws.data.deviceId!; + const userId = ws.data.userId!; + + // Fetch user's LLM config to get Groq API key + const configs = await db + .select() + .from(llmConfig) + .where(eq(llmConfig.userId, userId)) + .limit(1); + + if (configs.length === 0 || !configs[0].apiKey) { + sendToDevice(ws, { + type: "transcript_final", + text: "", + }); + break; + } + + handleVoiceStart(ws, deviceId, configs[0].apiKey); + break; + } + + case "voice_chunk": { + const deviceId = ws.data.deviceId!; + handleVoiceChunk(deviceId, (msg as unknown as { data: string }).data); + break; + } + + case "voice_stop": { + const deviceId = ws.data.deviceId!; + const userId = ws.data.userId!; + const action = (msg as unknown as { action: string }).action; + + if (action === "cancel") { + handleVoiceCancel(deviceId); + break; + } + + // action === "send" — 
finalize and fire goal + const configs = await db + .select() + .from(llmConfig) + .where(eq(llmConfig.userId, userId)) + .limit(1); + + const groqKey = configs[0]?.apiKey ?? ""; + const transcript = await handleVoiceSend(ws, deviceId, groqKey); + + if (transcript) { + // Fire the transcript as a goal — reuse existing goal logic + const persistentDeviceId = ws.data.persistentDeviceId!; + + if (activeSessions.has(deviceId)) { + sendToDevice(ws, { type: "goal_failed", message: "Agent already running" }); + break; + } + + const userLlmConfig: LLMConfig = { + provider: configs[0].provider, + apiKey: configs[0].apiKey, + model: configs[0].model ?? undefined, + }; + + console.log(`[Pipeline] Starting voice goal for device ${deviceId}: ${transcript}`); + const abortController = new AbortController(); + activeSessions.set(deviceId, { goal: transcript, abort: abortController }); + + sendToDevice(ws, { type: "goal_started", sessionId: deviceId, goal: transcript }); + + runPipeline({ + deviceId, + persistentDeviceId, + userId, + goal: transcript, + llmConfig: userLlmConfig, + signal: abortController.signal, + onStep(step) { + sendToDevice(ws, { + type: "step", + step: step.stepNumber, + action: step.action, + reasoning: step.reasoning, + }); + }, + onComplete(result) { + activeSessions.delete(deviceId); + sendToDevice(ws, { + type: "goal_completed", + success: result.success, + stepsUsed: result.stepsUsed, + }); + }, + }).catch((err) => { + activeSessions.delete(deviceId); + sendToDevice(ws, { type: "goal_failed", message: String(err) }); + }); + } + + break; + } +``` + +**Step 3: Verify types compile** + +Run: `cd server && bun run typecheck` +Expected: No errors + +**Step 4: Commit** + +```bash +git add server/src/ws/device.ts +git commit -m "feat(server): wire voice messages into device handler" +``` + +--- + +### Task 4: Android Protocol Models + +**Files:** +- Modify: `android/app/src/main/java/com/thisux/droidclaw/model/Protocol.kt` +- Modify: 
`android/app/src/main/java/com/thisux/droidclaw/model/AppState.kt` + +**Step 1: Add voice message data classes to Protocol.kt** + +Add after the `StopGoalMessage` class (after line 77): + +```kotlin +@Serializable +data class VoiceStartMessage( + val type: String = "voice_start" +) + +@Serializable +data class VoiceChunkMessage( + val type: String = "voice_chunk", + val data: String // base64-encoded PCM audio +) + +@Serializable +data class VoiceStopMessage( + val type: String = "voice_stop", + val action: String // "send" or "cancel" +) +``` + +**Step 2: Confirm `ServerMessage` can carry transcripts** + +No new field is needed on `ServerMessage`: the existing `text` field (line 101) carries the transcript content, and the `type` field (`transcript_partial` / `transcript_final`) distinguishes transcript messages from goal text.
+ +**Step 3: Add OverlayMode enum to AppState.kt** + +Check `AppState.kt` for existing enums, then add: + +```kotlin +enum class OverlayMode { + Idle, + Listening, + Executing +} +``` + +**Step 4: Build to verify** + +Run: `cd android && ./gradlew compileDebugKotlin` +Expected: BUILD SUCCESSFUL + +**Step 5: Commit** + +```bash +git add android/app/src/main/java/com/thisux/droidclaw/model/Protocol.kt +git add android/app/src/main/java/com/thisux/droidclaw/model/AppState.kt +git commit -m "feat(android): add voice protocol models and overlay mode enum" +``` + +--- + +### Task 5: Android VoiceRecorder + +**Files:** +- Create: `android/app/src/main/java/com/thisux/droidclaw/overlay/VoiceRecorder.kt` +- Modify: `android/app/src/main/AndroidManifest.xml` + +**Step 1: Add RECORD_AUDIO permission to manifest** + +In `AndroidManifest.xml`, add after the `SYSTEM_ALERT_WINDOW` permission (after line 14): + +```xml + +``` + +**Step 2: Create VoiceRecorder class** + +```kotlin +package com.thisux.droidclaw.overlay + +import android.Manifest +import android.content.Context +import android.content.pm.PackageManager +import android.media.AudioFormat +import android.media.AudioRecord +import android.media.MediaRecorder +import android.util.Base64 +import android.util.Log +import androidx.core.content.ContextCompat +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.Job +import kotlinx.coroutines.isActive +import kotlinx.coroutines.launch + +/** + * Captures audio from the microphone and streams base64-encoded PCM chunks. + * + * Audio format: 16kHz, mono, 16-bit PCM (linear16). + * Chunks are emitted every ~100ms via the [onChunk] callback. 
+ */ +class VoiceRecorder( + private val scope: CoroutineScope, + private val onChunk: (base64: String) -> Unit +) { + companion object { + private const val TAG = "VoiceRecorder" + private const val SAMPLE_RATE = 16000 + private const val CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO + private const val AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT + // ~100ms of audio at 16kHz mono 16-bit = 3200 bytes + private const val CHUNK_SIZE = 3200 + } + + private var audioRecord: AudioRecord? = null + private var recordingJob: Job? = null + + val isRecording: Boolean get() = recordingJob?.isActive == true + + fun hasPermission(context: Context): Boolean { + return ContextCompat.checkSelfPermission( + context, Manifest.permission.RECORD_AUDIO + ) == PackageManager.PERMISSION_GRANTED + } + + fun start(): Boolean { + if (isRecording) return false + + val bufferSize = maxOf( + AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT), + CHUNK_SIZE * 2 + ) + + val record = try { + AudioRecord( + MediaRecorder.AudioSource.MIC, + SAMPLE_RATE, + CHANNEL_CONFIG, + AUDIO_FORMAT, + bufferSize + ) + } catch (e: SecurityException) { + Log.e(TAG, "Missing RECORD_AUDIO permission", e) + return false + } + + if (record.state != AudioRecord.STATE_INITIALIZED) { + Log.e(TAG, "AudioRecord failed to initialize") + record.release() + return false + } + + audioRecord = record + record.startRecording() + + recordingJob = scope.launch(Dispatchers.IO) { + val buffer = ByteArray(CHUNK_SIZE) + while (isActive) { + val bytesRead = record.read(buffer, 0, CHUNK_SIZE) + if (bytesRead > 0) { + val base64 = Base64.encodeToString( + buffer.copyOf(bytesRead), + Base64.NO_WRAP + ) + onChunk(base64) + } + } + } + + Log.i(TAG, "Recording started") + return true + } + + fun stop() { + recordingJob?.cancel() + recordingJob = null + audioRecord?.let { + try { + it.stop() + it.release() + } catch (e: Exception) { + Log.w(TAG, "Error stopping AudioRecord", e) + } + } + audioRecord = null + Log.i(TAG, 
"Recording stopped") + } +} +``` + +**Step 3: Build to verify** + +Run: `cd android && ./gradlew compileDebugKotlin` +Expected: BUILD SUCCESSFUL + +**Step 4: Commit** + +```bash +git add android/app/src/main/AndroidManifest.xml +git add android/app/src/main/java/com/thisux/droidclaw/overlay/VoiceRecorder.kt +git commit -m "feat(android): add VoiceRecorder with AudioRecord PCM streaming" +``` + +--- + +### Task 6: Android Gradient Border Composable + +**Files:** +- Create: `android/app/src/main/java/com/thisux/droidclaw/overlay/GradientBorder.kt` + +**Step 1: Create the animated gradient border composable** + +```kotlin +package com.thisux.droidclaw.overlay + +import androidx.compose.animation.core.LinearEasing +import androidx.compose.animation.core.RepeatMode +import androidx.compose.animation.core.animateFloat +import androidx.compose.animation.core.infiniteRepeatable +import androidx.compose.animation.core.rememberInfiniteTransition +import androidx.compose.animation.core.tween +import androidx.compose.foundation.Canvas +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.runtime.Composable +import androidx.compose.runtime.getValue +import androidx.compose.ui.Modifier +import androidx.compose.ui.geometry.Offset +import androidx.compose.ui.geometry.Size +import androidx.compose.ui.graphics.Brush +import androidx.compose.ui.graphics.Color +import androidx.compose.ui.platform.LocalDensity +import androidx.compose.ui.unit.dp + +private val GradientColors = listOf( + Color(0xFF8B5CF6), // purple + Color(0xFF3B82F6), // blue + Color(0xFF06B6D4), // cyan + Color(0xFF10B981), // green + Color(0xFF8B5CF6), // purple (loop) +) + +/** + * Full-screen animated gradient border drawn along all 4 edges. + * The gradient colors rotate over a 3-second cycle. 
+ */ +@Composable +fun GradientBorder() { + val transition = rememberInfiniteTransition(label = "gradientRotation") + val offset by transition.animateFloat( + initialValue = 0f, + targetValue = 1f, + animationSpec = infiniteRepeatable( + animation = tween(durationMillis = 3000, easing = LinearEasing), + repeatMode = RepeatMode.Restart + ), + label = "gradientOffset" + ) + + val borderWidth = with(LocalDensity.current) { 4.dp.toPx() } + + Canvas(modifier = Modifier.fillMaxSize()) { + val w = size.width + val h = size.height + + // Shift colors based on animated offset + val shiftedColors = shiftColors(GradientColors, offset) + + // Top edge + drawRect( + brush = Brush.horizontalGradient(shiftedColors), + topLeft = Offset.Zero, + size = Size(w, borderWidth) + ) + + // Bottom edge + drawRect( + brush = Brush.horizontalGradient(shiftedColors.reversed()), + topLeft = Offset(0f, h - borderWidth), + size = Size(w, borderWidth) + ) + + // Left edge + drawRect( + brush = Brush.verticalGradient(shiftedColors), + topLeft = Offset.Zero, + size = Size(borderWidth, h) + ) + + // Right edge + drawRect( + brush = Brush.verticalGradient(shiftedColors.reversed()), + topLeft = Offset(w - borderWidth, 0f), + size = Size(borderWidth, h) + ) + } +} + +/** + * Rotate the color list by a fractional offset (0..1). 
+ */ +private fun shiftColors(colors: List<Color>, offset: Float): List<Color> { + if (colors.size < 2) return colors + val n = colors.size + val shift = (offset * n).toInt() % n + return colors.subList(shift, n) + colors.subList(0, shift) +} +``` + +**Step 2: Build to verify** + +Run: `cd android && ./gradlew compileDebugKotlin` +Expected: BUILD SUCCESSFUL + +**Step 3: Commit** + +```bash +git add android/app/src/main/java/com/thisux/droidclaw/overlay/GradientBorder.kt +git commit -m "feat(android): add animated gradient border composable" +``` + +--- + +### Task 7: Android Voice Overlay UI + +**Files:** +- Create: `android/app/src/main/java/com/thisux/droidclaw/overlay/VoiceOverlayContent.kt` + +**Step 1: Create the voice overlay composable** + +This is the interactive bottom panel with transcript text and Send/Cancel buttons. + +```kotlin +package com.thisux.droidclaw.overlay + +import androidx.compose.animation.core.LinearEasing +import androidx.compose.animation.core.RepeatMode +import androidx.compose.animation.core.animateFloat +import androidx.compose.animation.core.infiniteRepeatable +import androidx.compose.animation.core.rememberInfiniteTransition +import androidx.compose.animation.core.tween +import androidx.compose.foundation.background +import androidx.compose.foundation.layout.Arrangement +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.Spacer +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.fillMaxWidth +import androidx.compose.foundation.layout.height +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.layout.size +import androidx.compose.foundation.rememberScrollState +import androidx.compose.foundation.shape.CircleShape +import androidx.compose.foundation.shape.RoundedCornerShape +import androidx.compose.foundation.verticalScroll +import 
androidx.compose.material.icons.Icons +import androidx.compose.material.icons.filled.Close +import androidx.compose.material.icons.filled.Send +import androidx.compose.material3.Button +import androidx.compose.material3.ButtonDefaults +import androidx.compose.material3.Icon +import androidx.compose.material3.OutlinedButton +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.LaunchedEffect +import androidx.compose.runtime.getValue +import androidx.compose.ui.Alignment +import androidx.compose.ui.Modifier +import androidx.compose.ui.draw.alpha +import androidx.compose.ui.draw.clip +import androidx.compose.ui.graphics.Color +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.text.style.TextAlign +import androidx.compose.ui.unit.dp +import androidx.compose.ui.unit.sp + +private val AccentPurple = Color(0xFF8B5CF6) +private val PanelBackground = Color(0xCC1A1A1A) + +/** + * Voice overlay bottom panel: shows live transcript and Send/Cancel buttons. 
+ */ +@Composable +fun VoiceOverlayContent( + transcript: String, + onSend: () -> Unit, + onCancel: () -> Unit +) { + val scrollState = rememberScrollState() + + // Auto-scroll to bottom when transcript changes + LaunchedEffect(transcript) { + scrollState.animateScrollTo(scrollState.maxValue) + } + + Box( + modifier = Modifier.fillMaxSize(), + contentAlignment = Alignment.BottomCenter + ) { + Column( + modifier = Modifier + .fillMaxWidth() + .clip(RoundedCornerShape(topStart = 24.dp, topEnd = 24.dp)) + .background(PanelBackground) + .padding(24.dp), + horizontalAlignment = Alignment.CenterHorizontally + ) { + // Listening indicator + if (transcript.isEmpty()) { + ListeningIndicator() + Spacer(modifier = Modifier.height(16.dp)) + Text( + text = "Listening...", + color = Color.White.copy(alpha = 0.6f), + fontSize = 16.sp + ) + } else { + // Live transcript text + Text( + text = transcript, + color = Color.White, + fontSize = 24.sp, + fontWeight = FontWeight.Medium, + textAlign = TextAlign.Center, + lineHeight = 32.sp, + modifier = Modifier + .fillMaxWidth() + .height(160.dp) + .verticalScroll(scrollState) + .padding(horizontal = 8.dp) + ) + } + + Spacer(modifier = Modifier.height(24.dp)) + + // Action buttons + Row( + modifier = Modifier.fillMaxWidth(), + horizontalArrangement = Arrangement.spacedBy(16.dp, Alignment.CenterHorizontally) + ) { + OutlinedButton( + onClick = onCancel, + colors = ButtonDefaults.outlinedButtonColors( + contentColor = Color.White.copy(alpha = 0.7f) + ), + modifier = Modifier.weight(1f) + ) { + Icon( + imageVector = Icons.Default.Close, + contentDescription = null, + modifier = Modifier.size(18.dp) + ) + Text(" Cancel", fontSize = 15.sp) + } + + Button( + onClick = onSend, + enabled = transcript.isNotEmpty(), + colors = ButtonDefaults.buttonColors( + containerColor = AccentPurple, + contentColor = Color.White + ), + modifier = Modifier.weight(1f) + ) { + Icon( + imageVector = Icons.Default.Send, + contentDescription = null, + modifier = 
Modifier.size(18.dp) + ) + Text(" Send", fontSize = 15.sp) + } + } + } + } +} + +/** + * Pulsing dot animation while waiting for speech. + */ +@Composable +private fun ListeningIndicator() { + val transition = rememberInfiniteTransition(label = "listening") + val alpha by transition.animateFloat( + initialValue = 0.3f, + targetValue = 1f, + animationSpec = infiniteRepeatable( + animation = tween(800, easing = LinearEasing), + repeatMode = RepeatMode.Reverse + ), + label = "pulseAlpha" + ) + + Box( + modifier = Modifier + .size(48.dp) + .alpha(alpha) + .clip(CircleShape) + .background(AccentPurple) + ) +} +``` + +**Step 2: Build to verify** + +Run: `cd android && ./gradlew compileDebugKotlin` +Expected: BUILD SUCCESSFUL + +**Step 3: Commit** + +```bash +git add android/app/src/main/java/com/thisux/droidclaw/overlay/VoiceOverlayContent.kt +git commit -m "feat(android): add voice overlay UI with transcript and action buttons" +``` + +--- + +### Task 8: Expand AgentOverlay State Machine + +**Files:** +- Modify: `android/app/src/main/java/com/thisux/droidclaw/overlay/AgentOverlay.kt` + +This is the biggest change. 
`AgentOverlay` gains: +- An `OverlayMode` state (idle/listening/executing) +- A second full-screen overlay view for the gradient border (non-touchable) +- A third overlay view for the voice panel (touchable) +- `VoiceRecorder` integration +- Methods to transition between modes + +**Step 1: Rewrite AgentOverlay.kt** + +Replace the entire file with the expanded state machine: + +```kotlin +package com.thisux.droidclaw.overlay + +import android.content.Intent +import android.graphics.PixelFormat +import android.view.Gravity +import android.view.MotionEvent +import android.view.View +import android.view.WindowManager +import androidx.compose.runtime.mutableStateOf +import androidx.compose.ui.platform.ComposeView +import androidx.lifecycle.Lifecycle +import androidx.lifecycle.LifecycleService +import androidx.lifecycle.setViewTreeLifecycleOwner +import androidx.savedstate.SavedStateRegistry +import androidx.savedstate.SavedStateRegistryController +import androidx.savedstate.SavedStateRegistryOwner +import androidx.savedstate.setViewTreeSavedStateRegistryOwner +import com.thisux.droidclaw.MainActivity +import com.thisux.droidclaw.model.OverlayMode +import com.thisux.droidclaw.ui.theme.DroidClawTheme + +class AgentOverlay(private val service: LifecycleService) { + + private val windowManager = service.getSystemService(WindowManager::class.java) + + // Compose lifecycle support + private val savedStateOwner = object : SavedStateRegistryOwner { + private val controller = SavedStateRegistryController.create(this) + override val lifecycle: Lifecycle get() = service.lifecycle + override val savedStateRegistry: SavedStateRegistry get() = controller.savedStateRegistry + init { controller.performRestore(null) } + } + + // ── State ─────────────────────────────────────────────── + var mode = mutableStateOf(OverlayMode.Idle) + private set + var transcript = mutableStateOf("") + private set + + // ── Callbacks (set by ConnectionService) ──────────────── + var onVoiceSend: ((String) 
-> Unit)? = null + var onVoiceCancel: (() -> Unit)? = null + + // ── Views ─────────────────────────────────────────────── + private var pillView: ComposeView? = null + private var borderView: ComposeView? = null + private var voicePanelView: ComposeView? = null + + // ── Voice recorder ────────────────────────────────────── + private var voiceRecorder: VoiceRecorder? = null + var onAudioChunk: ((String) -> Unit)? = null + + // ── Layout params ─────────────────────────────────────── + + private val pillParams = WindowManager.LayoutParams( + WindowManager.LayoutParams.WRAP_CONTENT, + WindowManager.LayoutParams.WRAP_CONTENT, + WindowManager.LayoutParams.TYPE_APPLICATION_OVERLAY, + WindowManager.LayoutParams.FLAG_NOT_FOCUSABLE, + PixelFormat.TRANSLUCENT + ).apply { + gravity = Gravity.TOP or Gravity.START + x = 0 + y = 200 + } + + private val borderParams = WindowManager.LayoutParams( + WindowManager.LayoutParams.MATCH_PARENT, + WindowManager.LayoutParams.MATCH_PARENT, + WindowManager.LayoutParams.TYPE_APPLICATION_OVERLAY, + WindowManager.LayoutParams.FLAG_NOT_TOUCHABLE or + WindowManager.LayoutParams.FLAG_NOT_FOCUSABLE or + WindowManager.LayoutParams.FLAG_LAYOUT_IN_SCREEN, + PixelFormat.TRANSLUCENT + ) + + private val voicePanelParams = WindowManager.LayoutParams( + WindowManager.LayoutParams.MATCH_PARENT, + WindowManager.LayoutParams.MATCH_PARENT, + WindowManager.LayoutParams.TYPE_APPLICATION_OVERLAY, + WindowManager.LayoutParams.FLAG_NOT_FOCUSABLE or + WindowManager.LayoutParams.FLAG_LAYOUT_IN_SCREEN, + PixelFormat.TRANSLUCENT + ) + + // ── Public API ────────────────────────────────────────── + + fun show() { + if (pillView != null) return + showPill() + } + + fun hide() { + hidePill() + hideVoiceOverlay() + } + + fun destroy() { + hide() + voiceRecorder?.stop() + voiceRecorder = null + } + + /** + * Transition to listening mode: hide pill, show gradient border + voice panel, + * start recording audio. 
+ */ + fun startListening() { + mode.value = OverlayMode.Listening + transcript.value = "" + + hidePill() + showVoiceOverlay() + + // Start audio capture + voiceRecorder = VoiceRecorder( + scope = (service as LifecycleService).lifecycleScope, + onChunk = { base64 -> onAudioChunk?.invoke(base64) } + ) + voiceRecorder?.start() + } + + /** + * User tapped Send: stop recording, transition to executing. + */ + fun sendVoice() { + voiceRecorder?.stop() + voiceRecorder = null + mode.value = OverlayMode.Executing + hideVoiceOverlay() + showPill() + onVoiceSend?.invoke(transcript.value) + } + + /** + * User tapped Cancel: stop recording, go back to idle. + */ + fun cancelVoice() { + voiceRecorder?.stop() + voiceRecorder = null + mode.value = OverlayMode.Idle + hideVoiceOverlay() + showPill() + onVoiceCancel?.invoke() + } + + /** + * Update the live transcript text from a partial/final server response. + */ + fun updateTranscript(text: String) { + transcript.value = text + } + + /** + * Return to idle mode (e.g. after goal completion). 
+ */ + fun returnToIdle() { + mode.value = OverlayMode.Idle + } + + // ── Private: Pill overlay ─────────────────────────────── + + private fun showPill() { + if (pillView != null) return + + val view = ComposeView(service).apply { + importantForAccessibility = View.IMPORTANT_FOR_ACCESSIBILITY_NO_HIDE_DESCENDANTS + setViewTreeLifecycleOwner(service) + setViewTreeSavedStateRegistryOwner(savedStateOwner) + setContent { OverlayContent() } + setupDrag(this) + } + + pillView = view + windowManager.addView(view, pillParams) + } + + private fun hidePill() { + pillView?.let { windowManager.removeView(it) } + pillView = null + } + + // ── Private: Voice overlay (border + panel) ───────────── + + private fun showVoiceOverlay() { + if (borderView != null) return + + // Layer 1: gradient border (non-touchable) + val border = ComposeView(service).apply { + importantForAccessibility = View.IMPORTANT_FOR_ACCESSIBILITY_NO_HIDE_DESCENDANTS + setViewTreeLifecycleOwner(service) + setViewTreeSavedStateRegistryOwner(savedStateOwner) + setContent { + DroidClawTheme { GradientBorder() } + } + } + borderView = border + windowManager.addView(border, borderParams) + + // Layer 2: voice panel (touchable) + val panel = ComposeView(service).apply { + importantForAccessibility = View.IMPORTANT_FOR_ACCESSIBILITY_NO_HIDE_DESCENDANTS + setViewTreeLifecycleOwner(service) + setViewTreeSavedStateRegistryOwner(savedStateOwner) + setContent { + DroidClawTheme { + VoiceOverlayContent( + transcript = transcript.value, + onSend = { sendVoice() }, + onCancel = { cancelVoice() } + ) + } + } + } + voicePanelView = panel + windowManager.addView(panel, voicePanelParams) + } + + private fun hideVoiceOverlay() { + borderView?.let { windowManager.removeView(it) } + borderView = null + voicePanelView?.let { windowManager.removeView(it) } + voicePanelView = null + } + + // ── Private: Drag handling for pill ───────────────────── + + private fun setupDrag(view: View) { + var initialX = 0 + var initialY = 0 + var 
initialTouchX = 0f + var initialTouchY = 0f + var isDragging = false + + view.setOnTouchListener { _, event -> + when (event.action) { + MotionEvent.ACTION_DOWN -> { + initialX = pillParams.x + initialY = pillParams.y + initialTouchX = event.rawX + initialTouchY = event.rawY + isDragging = false + true + } + MotionEvent.ACTION_MOVE -> { + val dx = (event.rawX - initialTouchX).toInt() + val dy = (event.rawY - initialTouchY).toInt() + if (Math.abs(dx) > 10 || Math.abs(dy) > 10) isDragging = true + pillParams.x = initialX + dx + pillParams.y = initialY + dy + windowManager.updateViewLayout(view, pillParams) + true + } + MotionEvent.ACTION_UP -> { + if (!isDragging) { + // Tap on pill: activate voice mode if idle, open app otherwise + if (mode.value == OverlayMode.Idle) { + startListening() + } else { + val intent = Intent(service, MainActivity::class.java).apply { + flags = Intent.FLAG_ACTIVITY_NEW_TASK or + Intent.FLAG_ACTIVITY_SINGLE_TOP or + Intent.FLAG_ACTIVITY_REORDER_TO_FRONT + } + service.startActivity(intent) + } + } + true + } + else -> false + } + } + } +} +``` + +**Step 2: Import lifecycleScope** + +Make sure to add this import at the top: + +```kotlin +import androidx.lifecycle.lifecycleScope +``` + +**Step 3: Build to verify** + +Run: `cd android && ./gradlew compileDebugKotlin` +Expected: BUILD SUCCESSFUL + +**Step 4: Commit** + +```bash +git add android/app/src/main/java/com/thisux/droidclaw/overlay/AgentOverlay.kt +git commit -m "feat(android): expand AgentOverlay with voice mode state machine" +``` + +--- + +### Task 9: Wire Voice into ConnectionService + +**Files:** +- Modify: `android/app/src/main/java/com/thisux/droidclaw/connection/ConnectionService.kt` +- Modify: `android/app/src/main/java/com/thisux/droidclaw/connection/CommandRouter.kt` + +**Step 1: Add overlay voice state flow to ConnectionService companion** + +In `ConnectionService.kt`, add to the companion object (after line 49): + +```kotlin + val overlayTranscript = MutableStateFlow("") 
+```
+
+**Step 2: Wire overlay callbacks in ConnectionService.onCreate()**
+
+After `overlay = AgentOverlay(this)` (line 68), add:
+
+```kotlin
+    overlay?.onAudioChunk = { base64 ->
+        webSocket?.sendTyped(VoiceChunkMessage(data = base64))
+    }
+    overlay?.onVoiceSend = { _ ->
+        webSocket?.sendTyped(VoiceStopMessage(action = "send"))
+    }
+    overlay?.onVoiceCancel = {
+        webSocket?.sendTyped(VoiceStopMessage(action = "cancel"))
+    }
+```
+
+**Step 3: Send voice_start when overlay enters listening mode**
+
+The server also needs a `voice_start` message when recording begins. Rather than adding another callback to `AgentOverlay`, have `ConnectionService` observe the overlay's `mode` state and send `voice_start` on the transition to `Listening`. Note (review): `snapshotFlow` only tracks Compose snapshot state — this assumes `mode` is a Compose `MutableState`; if it is a `MutableStateFlow`, collect the flow directly instead of wrapping it in `snapshotFlow`.
+
+Add after the callbacks above:
+
+```kotlin
+    // When overlay starts listening, notify server
+    overlay?.let { ov ->
+        lifecycleScope.launch {
+            snapshotFlow { ov.mode.value }.collect { mode ->
+                when (mode) {
+                    OverlayMode.Listening -> {
+                        webSocket?.sendTyped(VoiceStartMessage())
+                    }
+                    else -> {}
+                }
+            }
+        }
+    }
+```
+
+**Step 4: Handle transcript messages in CommandRouter**
+
+In `CommandRouter.kt`, add cases in the `when (msg.type)` block (before line 74):
+
+```kotlin
+    "transcript_partial" -> {
+        ConnectionService.overlayTranscript.value = msg.text ?: ""
+        val overlay = ConnectionService.instance?.overlay
+        overlay?.updateTranscript(msg.text ?: "")
+        Log.d(TAG, "Transcript partial: ${msg.text}")
+    }
+    "transcript_final" -> {
+        ConnectionService.overlayTranscript.value = msg.text ?: ""
+        val overlay = ConnectionService.instance?.overlay
+        overlay?.updateTranscript(msg.text ?: "")
+        Log.d(TAG, "Transcript final: ${msg.text}")
+    }
+```
+
+**Step 5: Make overlay accessible from ConnectionService for CommandRouter**
+
+The `overlay` field in `ConnectionService` is currently `private`. We need to expose it. 
In `ConnectionService.kt`, change: + +```kotlin + private var overlay: AgentOverlay? = null +``` + +to: + +```kotlin + internal var overlay: AgentOverlay? = null +``` + +**Step 6: Add required imports** + +In `ConnectionService.kt`, add: + +```kotlin +import com.thisux.droidclaw.model.VoiceStartMessage +import com.thisux.droidclaw.model.VoiceChunkMessage +import com.thisux.droidclaw.model.VoiceStopMessage +import com.thisux.droidclaw.model.OverlayMode +import androidx.compose.runtime.snapshotFlow +``` + +In `CommandRouter.kt`, add: + +```kotlin +import com.thisux.droidclaw.connection.ConnectionService +``` + +**Step 7: Return overlay to idle when goal completes** + +In `CommandRouter.kt`, in the `"goal_completed"` case (line 65-68), add: + +```kotlin + "goal_completed" -> { + currentGoalStatus.value = if (msg.success == true) GoalStatus.Completed else GoalStatus.Failed + ConnectionService.instance?.overlay?.returnToIdle() + Log.i(TAG, "Goal completed: success=${msg.success}, steps=${msg.stepsUsed}") + } +``` + +**Step 8: Build to verify** + +Run: `cd android && ./gradlew compileDebugKotlin` +Expected: BUILD SUCCESSFUL + +**Step 9: Commit** + +```bash +git add android/app/src/main/java/com/thisux/droidclaw/connection/ConnectionService.kt +git add android/app/src/main/java/com/thisux/droidclaw/connection/CommandRouter.kt +git commit -m "feat(android): wire voice recording and transcript into ConnectionService" +``` + +--- + +### Task 10: Runtime Permission Handling + +**Files:** +- Modify: `android/app/src/main/java/com/thisux/droidclaw/overlay/AgentOverlay.kt` + +**Step 1: Add permission check before starting voice mode** + +In `AgentOverlay.startListening()`, add a permission check at the top: + +```kotlin + fun startListening() { + // Check audio permission first + val recorder = VoiceRecorder( + scope = (service as LifecycleService).lifecycleScope, + onChunk = { base64 -> onAudioChunk?.invoke(base64) } + ) + if (!recorder.hasPermission(service)) { + // Open the 
app to request permission
+            val intent = Intent(service, MainActivity::class.java).apply {
+                flags = Intent.FLAG_ACTIVITY_NEW_TASK or
+                        Intent.FLAG_ACTIVITY_SINGLE_TOP
+                putExtra("request_audio_permission", true)
+            }
+            service.startActivity(intent)
+            return
+        }
+
+        mode.value = OverlayMode.Listening
+        transcript.value = ""
+
+        hidePill()
+        showVoiceOverlay()
+
+        voiceRecorder = recorder
+        voiceRecorder?.start()
+    }
+```
+
+**Step 2: Handle permission request in MainActivity**
+
+In `MainActivity.kt`, check for the extra in `onCreate()` or `onNewIntent()` and request `RECORD_AUDIO` permission using the standard Activity result API. This is a one-time setup — once granted, the overlay will work directly.
+
+Add a permission launcher in `MainActivity`:
+
+```kotlin
+    private val audioPermissionLauncher = registerForActivityResult(
+        ActivityResultContracts.RequestPermission()
+    ) { granted ->
+        if (granted) {
+            // Permission granted, user can now tap overlay to start voice
+            // No action needed — they'll tap the pill again
+        }
+    }
+```
+
+In `onCreate()` or `onNewIntent()`, check:
+
+```kotlin
+    override fun onNewIntent(intent: Intent) {
+        super.onNewIntent(intent)
+        if (intent.getBooleanExtra("request_audio_permission", false)) {
+            audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO)
+        }
+    }
+```
+
+Note: `onNewIntent()` only fires when the activity instance is already running (the overlay launches it with `FLAG_ACTIVITY_SINGLE_TOP`). On a cold start the extra arrives on the launch intent instead, so apply the same `getBooleanExtra` check to `intent` in `onCreate()` as well.
+
+**Step 3: Build to verify**
+
+Run: `cd android && ./gradlew compileDebugKotlin`
+Expected: BUILD SUCCESSFUL
+
+**Step 4: Commit**
+
+```bash
+git add android/app/src/main/java/com/thisux/droidclaw/overlay/AgentOverlay.kt
+git add android/app/src/main/java/com/thisux/droidclaw/MainActivity.kt
+git commit -m "feat(android): add RECORD_AUDIO runtime permission handling"
+```
+
+---
+
+### Task 11: Integration Test — Manual Verification
+
+**No code changes. 
Manual testing checklist.** + +**Step 1: Build and install** + +```bash +cd android && ./gradlew installDebug +``` + +**Step 2: Start server** + +```bash +cd server && bun run dev +``` + +**Step 3: Test the flow** + +1. Open DroidClaw app → connect to server +2. Go to home screen (app in background) → floating pill visible +3. Tap the pill → gradient border should appear around screen edges +4. Speak a command → transcript text should appear live on the overlay +5. Tap Send → pill reappears showing agent progress +6. Agent executes the goal +7. Pill returns to "Ready" + +**Step 4: Test cancel flow** + +1. Tap pill → voice mode activates +2. Speak something +3. Tap Cancel → pill reappears, no goal fired + +**Step 5: Test permission flow** + +1. Revoke RECORD_AUDIO permission in Android settings +2. Tap pill → should open app with permission request dialog +3. Grant permission → tap pill again → voice mode should work + +**Step 6: Final commit (if any fixes needed)** + +```bash +git add -A +git commit -m "fix: integration fixes for voice overlay" +```