From 4c8241c96433fbb679aa7b31d5185fc0aec4f64d Mon Sep 17 00:00:00 2001 From: Sanju Sivalingam Date: Tue, 17 Feb 2026 14:27:26 +0530 Subject: [PATCH] feat: add agent loop with LLM integration and stuck detection Server-side agent loop that adapts the CLI kernel to work over WebSocket. Three new modules: stuck detection, LLM provider abstraction (OpenAI/Groq/ OpenRouter), and the main perception-reasoning-action loop. Also wires up the goals route to start agent loops with duplicate-device protection. Co-Authored-By: Claude Opus 4.6 --- server/src/agent/llm.ts | 318 +++++++++++++++++++++++++ server/src/agent/loop.ts | 459 +++++++++++++++++++++++++++++++++++++ server/src/agent/stuck.ts | 97 ++++++++ server/src/routes/goals.ts | 70 +++++- 4 files changed, 939 insertions(+), 5 deletions(-) create mode 100644 server/src/agent/llm.ts create mode 100644 server/src/agent/loop.ts create mode 100644 server/src/agent/stuck.ts diff --git a/server/src/agent/llm.ts b/server/src/agent/llm.ts new file mode 100644 index 0000000..61d2adb --- /dev/null +++ b/server/src/agent/llm.ts @@ -0,0 +1,318 @@ +/** + * LLM provider abstraction for the DroidClaw server agent loop. + * + * For v1, implements an OpenAI-compatible provider that works with + * OpenAI, Groq, and OpenRouter (all use the same /chat/completions API). + * + * The SYSTEM_PROMPT is adapted from the CLI src/llm-providers.ts, + * with ADB-specific references removed since the phone handles + * actions directly via the WebSocket companion app. 
+ */ + +// ─── Types ────────────────────────────────────────────────────── + +export interface LLMConfig { + provider: string; + apiKey: string; + model?: string; + baseUrl?: string; +} + +export interface LLMProvider { + getAction( + systemPrompt: string, + userPrompt: string, + imageBase64?: string + ): Promise; +} + +// ─── System Prompt ────────────────────────────────────────────── + +/** + * Returns the system prompt that defines all 22+ actions and rules + * for the Android driver agent. Adapted from the CLI SYSTEM_PROMPT + * with ADB references removed (phone companion handles execution). + */ +export function getSystemPrompt(): string { + return `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI. + +You will receive: +1. GOAL -- the user's task. +2. FOREGROUND_APP -- the currently active app package and activity. +3. LAST_ACTION_RESULT -- the outcome of your previous action (success/failure and details). +4. SCREEN_CONTEXT -- JSON array of interactive UI elements with coordinates and states. +5. SCREENSHOT -- an image of the current screen (when available). +6. SCREEN_CHANGE -- what changed since your last action (or if the screen is stuck). +7. VISION_FALLBACK -- present when the accessibility tree is empty (custom UI / WebView). + +Previous conversation turns contain your earlier observations and actions (multi-turn memory). + +You must output ONLY a valid JSON object with your next action. + +═══════════════════════════════════════════ +THINKING & PLANNING +═══════════════════════════════════════════ + +Before each action, include a "think" field with your reasoning about the current state and what to do next. + +Optionally include: +- "plan": an array of 3-5 high-level steps to achieve the goal +- "planProgress": a brief note on which plan step you're currently on + +Example: +{"think": "I see the Settings app is open. 
I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"} + +═══════════════════════════════════════════ +AVAILABLE ACTIONS (22 total) +═══════════════════════════════════════════ + +Navigation (coordinates MUST be a JSON array of TWO separate integers [x, y] -- never concatenate them): + {"action": "tap", "coordinates": [540, 1200], "reason": "..."} + {"action": "longpress", "coordinates": [540, 1200], "reason": "..."} + {"action": "scroll", "direction": "up|down|left|right", "reason": "Scroll to see more content (down=below, up=above)"} + {"action": "enter", "reason": "Press Enter/submit"} + {"action": "back", "reason": "Navigate back"} + {"action": "home", "reason": "Go to home screen"} + +Text Input (ALWAYS include coordinates to focus the correct field before typing): + {"action": "type", "coordinates": [540, 648], "text": "Hello World", "reason": "..."} + {"action": "clear", "reason": "Clear current text field before typing"} + +App Control: + {"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"} + {"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"} + {"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"} + {"action": "open_url", "url": "https://example.com", "reason": "Open URL in browser"} + {"action": "switch_app", "package": "com.whatsapp", "reason": "Switch to WhatsApp"} + {"action": "open_settings", "setting": "wifi|bluetooth|display|sound|battery|location|apps|date|accessibility|developer", "reason": "Open settings screen"} + +Data: + {"action": "clipboard_get", "reason": "Read clipboard contents"} + {"action": "clipboard_set", "text": "copied text", 
"reason": "Set clipboard"} + {"action": "paste", "coordinates": [540, 804], "reason": "Paste clipboard into focused field"} + +Device: + {"action": "notifications", "reason": "Read notification bar content"} + {"action": "keyevent", "code": 187, "reason": "Send keycode (187=recent apps, 26=power, etc.)"} + +System: + {"action": "wait", "reason": "Wait for screen to load"} + {"action": "done", "reason": "Task is complete"} + +Multi-Step Actions (PREFER these over basic actions when applicable): + {"action": "read_screen", "reason": "Scroll through entire page, collect ALL text, copy to clipboard"} + {"action": "submit_message", "reason": "Find and tap Send button, wait for response"} + {"action": "copy_visible_text", "reason": "Copy all visible text to clipboard"} + {"action": "copy_visible_text", "query": "search term", "reason": "Copy matching text to clipboard"} + {"action": "wait_for_content", "reason": "Wait for new content to appear"} + {"action": "find_and_tap", "query": "Button Label", "reason": "Find element by text and tap it"} + {"action": "compose_email", "query": "recipient@email.com", "reason": "Fill email To+Body, pastes clipboard into body"} + {"action": "compose_email", "query": "recipient@email.com", "text": "body", "reason": "Fill email with specific body"} + NOTE: compose_email REQUIRES "query" = recipient email. "text" is optional body (clipboard used if empty). + +═══════════════════════════════════════════ +ELEMENT PROPERTIES YOU WILL SEE +═══════════════════════════════════════════ + +Each element in SCREEN_CONTEXT has: +- text: visible label or content description +- center: [x, y] coordinates to tap +- action: suggested action -- "tap", "type", "longpress", "scroll", or "read" +- enabled: false (only shown when disabled -- DO NOT tap disabled elements!) 
+- checked: true (only shown for ON checkboxes/toggles) +- focused: true (only shown when field has input focus) +- hint: placeholder text (only shown when present) +- editable: true (only shown for text input fields) +- scrollable: true (only shown for scrollable containers) + +═══════════════════════════════════════════ +CRITICAL RULES +═══════════════════════════════════════════ + +1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative. +2. TEXT INPUT: ALWAYS include "coordinates" with "type" to focus the correct field. Without coordinates, text goes into whatever field was last focused -- which may be WRONG. If "editable": true, use "clear" first if field has existing text, then "type". +3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered. +4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else. +5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy. +6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen. +7. READ PAGES: Use "read_screen" to collect all text from a page (search results, articles, feeds). It scrolls automatically and copies everything to clipboard. +8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc). +9. SCROLLING: If the item you need isn't visible, use "scroll" with direction "down" to see more below, or "up" for above. +10. MULTI-APP: Use "switch_app" with the package name to switch directly between apps. Or use "home" then "launch". Use "back" to return within the same app. +11. PASSWORDS: Never log or output the text of password fields. +12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success. +13. SUBMIT IN CHAT APPS: Use "submit_message" action instead of "enter" in chat apps. It finds and taps the Send button, waits for a response, and reports new content. 
Only use "enter" in search bars or web forms. +14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents. +15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue. +16. COPY-PASTE: PREFERRED: Use "copy_visible_text" action to copy text to clipboard programmatically -- this bypasses unreliable UI Copy buttons entirely. Then switch apps and "paste". + ALTERNATIVE: Use "clipboard_set" with the text you see in SCREEN_CONTEXT, then switch apps and "paste". + FALLBACK: Just "type" the text directly into the target app field. + NEVER type a vague description -- always use the actual text content. +17. COORDINATES: ALWAYS use coordinates from SCREEN_CONTEXT elements (the "center" field). NEVER estimate or guess coordinates from screenshots -- they are inaccurate. Screenshots help you understand the layout; SCREEN_CONTEXT provides the correct tap targets. +18. BACK IS DESTRUCTIVE: NEVER use "back" to leave an app while you have a task in progress within it. You will LOSE all progress (typed text, loading responses, navigation state). Try all other in-app approaches first. Only use "back" after 5+ failed attempts within the app. +19. LEARN FROM HISTORY: Before choosing an action, check your earlier turns. If "enter" failed to submit a query before, do NOT try "enter" again -- find and tap the Send button. If specific coordinates didn't work, try different ones. Never repeat a strategy that already failed in this session. +20. EMAIL COMPOSE: ALWAYS use "compose_email" action when filling email fields. It fills To, Subject, and Body in the correct order. Pass the recipient email in "query" and body text in "text" (or it pastes from clipboard). NEVER manually type/paste into email fields -- you WILL put it in the wrong field. + +═══════════════════════════════════════════ +ADAPTIVE PROBLEM-SOLVING +═══════════════════════════════════════════ + +NEVER REPEAT A FAILING ACTION more than once. 
If an action doesn't produce the expected result after 1 attempt, STOP and try a completely different approach. + +SILENT SUCCESSES: Some actions succeed WITHOUT changing the screen: +- Tapping "Copy", "Share", "Like", or "Bookmark" buttons often works silently. +- If you tapped a Copy button and the screen didn't change, it likely WORKED. Move on to the next step instead of retrying. + +SCREEN_CONTEXT IS YOUR DATA: The text in SCREEN_CONTEXT elements is data you already have. You can use it directly in: +- "clipboard_set" -- to set clipboard contents programmatically (more reliable than UI copy) +- "type" -- to enter text directly into any field +You do NOT need to "copy" text via UI -- you already have it from SCREEN_CONTEXT. + +GOAL-ORIENTED THINKING: Focus on WHAT you need to accomplish, not on rigidly following planned steps. If a step fails, ask: "What was the PURPOSE of this step?" and find another way. +- Goal says "copy and send as email"? If Copy fails, use clipboard_set with SCREEN_CONTEXT text, or type it directly in the email. +- Goal says "search for X"? If enter doesn't submit, look for and tap the send/search button. +- Goal says "open app X"? Use "launch" with package name instead of hunting for icons. + +SMART DECISION PRIORITIES: When multiple approaches can achieve the same result, prefer: +1. Programmatic actions (clipboard_set, launch) -- most reliable, no UI dependency. +2. Direct input (type, paste, enter) -- reliable when field is focused. +3. UI button interactions (tap, longpress) -- LEAST reliable, depends on correct coordinates. +Before choosing an action, ask: "Is there a simpler, more direct way to do this?" + +PATIENCE WITH LOADING: AI chatbots (ChatGPT, Gemini, Claude) take 5-15 seconds to generate responses. After submitting a query, use "wait" 2-3 times before assuming it failed. Do NOT start scrolling or navigating away prematurely. + +ESCAPE STUCK LOOPS -- when stuck, try in this priority order: +1. 
The action may have already succeeded silently -- MOVE ON to the next task step. +2. Use programmatic alternatives (clipboard_set, type, launch with URI). +3. Try a completely different UI element or interaction method. +4. Navigate away (back, home) ONLY as an absolute last resort -- this loses progress.`; +} + +// ─── Provider Implementation ──────────────────────────────────── + +const BASE_URLS: Record = { + openai: "https://api.openai.com/v1", + groq: "https://api.groq.com/openai/v1", + openrouter: "https://openrouter.ai/api/v1", +}; + +const DEFAULT_MODELS: Record = { + openai: "gpt-4o", + groq: "llama-3.3-70b-versatile", + openrouter: "google/gemini-2.0-flash-001", +}; + +function getDefaultModel(provider: string): string { + return DEFAULT_MODELS[provider] ?? "gpt-4o"; +} + +/** + * Creates an OpenAI-compatible LLM provider. + * Works with OpenAI, Groq, and OpenRouter since they all share the + * same /chat/completions API format. + */ +export function getLlmProvider(config: LLMConfig): LLMProvider { + const baseUrl = config.baseUrl ?? BASE_URLS[config.provider] ?? BASE_URLS.openai; + const model = config.model ?? 
getDefaultModel(config.provider); + + return { + async getAction( + systemPrompt: string, + userPrompt: string, + imageBase64?: string + ): Promise { + const messages: Array<{ role: string; content: unknown }> = [ + { role: "system", content: systemPrompt }, + ]; + + if (imageBase64) { + messages.push({ + role: "user", + content: [ + { type: "text", text: userPrompt }, + { + type: "image_url", + image_url: { + url: `data:image/png;base64,${imageBase64}`, + detail: "low", + }, + }, + ], + }); + } else { + messages.push({ role: "user", content: userPrompt }); + } + + const response = await fetch(`${baseUrl}/chat/completions`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${config.apiKey}`, + }, + body: JSON.stringify({ + model, + messages, + temperature: 0.2, + max_tokens: 1024, + response_format: { type: "json_object" }, + }), + }); + + if (!response.ok) { + const error = await response.text(); + throw new Error(`LLM API error (${response.status}): ${error}`); + } + + const data = (await response.json()) as { + choices: Array<{ message: { content: string } }>; + }; + return data.choices[0]?.message?.content ?? ""; + }, + }; +} + +// ─── JSON Response Parsing ────────────────────────────────────── + +/** + * Sanitizes raw LLM text so it can be parsed as JSON. + * LLMs often put literal newlines inside JSON string values. + */ +function sanitizeJsonText(raw: string): string { + return raw.replace(/\n/g, " ").replace(/\r/g, " "); +} + +/** + * Parses an LLM response into a JSON record. Handles: + * - Clean JSON + * - Markdown-wrapped code blocks (```json ... ```) + * - Mixed text with embedded JSON + * + * Returns null on parse failure. 
+ */ +export function parseJsonResponse(raw: string): Record | null { + // Try direct parse + try { + return JSON.parse(raw) as Record; + } catch { + // continue + } + + // Try with sanitized newlines + try { + return JSON.parse(sanitizeJsonText(raw)) as Record; + } catch { + // continue + } + + // Try extracting JSON from markdown code blocks or mixed text + const match = raw.match(/\{[\s\S]*\}/); + if (match) { + try { + return JSON.parse(sanitizeJsonText(match[0])) as Record; + } catch { + // fall through + } + } + + return null; +} diff --git a/server/src/agent/loop.ts b/server/src/agent/loop.ts new file mode 100644 index 0000000..9defb10 --- /dev/null +++ b/server/src/agent/loop.ts @@ -0,0 +1,459 @@ +/** + * Server-side agent loop for DroidClaw. + * + * Adapts the CLI agent loop (src/kernel.ts) to work over WebSocket. + * The loop runs on the server: gets screen state from the phone via + * sessions.sendCommand(), calls an LLM for decision-making, and sends + * action commands back to the device. + * + * Core flow: + * 1. Send get_screen to device + * 2. Build prompt with screen elements, goal, step count, stuck hints + * 3. Call LLM via provider + * 4. Parse response as ActionDecision + * 5. If action is "done", stop + * 6. Notify dashboard subscribers of the step + * 7. Map action to WebSocket command and send to device + * 8. 
Repeat until done or maxSteps reached + */ + +import { sessions } from "../ws/sessions.js"; +import { + getLlmProvider, + getSystemPrompt, + parseJsonResponse, + type LLMConfig, +} from "./llm.js"; +import { createStuckDetector } from "./stuck.js"; +import type { UIElement, ActionDecision } from "@droidclaw/shared"; + +// ─── Public Types ─────────────────────────────────────────────── + +export interface AgentLoopOptions { + deviceId: string; + userId: string; + goal: string; + llmConfig: LLMConfig; + maxSteps?: number; + onStep?: (step: AgentStep) => void; + onComplete?: (result: AgentResult) => void; +} + +export interface AgentStep { + stepNumber: number; + action: ActionDecision; + reasoning: string; + screenHash: string; +} + +export interface AgentResult { + success: boolean; + stepsUsed: number; + sessionId: string; +} + +// ─── Screen Hash ──────────────────────────────────────────────── + +/** + * Compute a screen hash for stuck detection. + * Same algorithm as src/sanitizer.ts computeScreenHash(). 
+ */ +function computeScreenHash(elements: UIElement[]): string { + const parts = elements.map( + (e) => + `${e.id}|${e.text}|${e.center[0]},${e.center[1]}|${e.enabled}|${e.checked}` + ); + return parts.join(";"); +} + +// ─── Screen Diffing ───────────────────────────────────────────── + +interface ScreenDiff { + changed: boolean; + addedTexts: string[]; + removedTexts: string[]; + summary: string; +} + +function diffScreenState( + prevElements: UIElement[], + currElements: UIElement[] +): ScreenDiff { + const prevTexts = new Set(prevElements.map((e) => e.text).filter(Boolean)); + const currTexts = new Set(currElements.map((e) => e.text).filter(Boolean)); + + const addedTexts = [...currTexts].filter((t) => !prevTexts.has(t)); + const removedTexts = [...prevTexts].filter((t) => !currTexts.has(t)); + + const prevHash = computeScreenHash(prevElements); + const currHash = computeScreenHash(currElements); + const changed = prevHash !== currHash; + + let summary = ""; + if (!changed) { + summary = "Screen has NOT changed since last action."; + } else { + const parts: string[] = []; + if (addedTexts.length > 0) { + parts.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`); + } + if (removedTexts.length > 0) { + parts.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`); + } + summary = parts.join(". ") || "Screen layout changed."; + } + + return { changed, addedTexts, removedTexts, summary }; +} + +// ─── Action → WebSocket Command Mapping ───────────────────────── + +/** + * Maps an ActionDecision to a WebSocket command object for the device. + * The device companion app receives these and executes the corresponding + * ADB/accessibility action. + */ +function actionToCommand( + action: ActionDecision +): Record { + switch (action.action) { + case "tap": + return { + type: "tap", + x: action.coordinates?.[0], + y: action.coordinates?.[1], + }; + + case "type": + return { type: "type", text: action.text ?? 
"" }; + + case "enter": + return { type: "enter" }; + + case "back": + return { type: "back" }; + + case "home": + return { type: "home" }; + + case "swipe": + case "scroll": { + // Map scroll direction to swipe coordinates (default 1080px wide screen) + const dir = action.direction ?? "down"; + let x1 = 540, y1 = 1600, x2 = 540, y2 = 400; + if (dir === "up") { + y1 = 400; y2 = 1600; // swipe from top to bottom = scroll up + } else if (dir === "left") { + x1 = 900; y1 = 1200; x2 = 180; y2 = 1200; + } else if (dir === "right") { + x1 = 180; y1 = 1200; x2 = 900; y2 = 1200; + } + // dir === "down" uses defaults: swipe from bottom to top = scroll down + return { type: "swipe", x1, y1, x2, y2 }; + } + + case "longpress": + return { + type: "longpress", + x: action.coordinates?.[0], + y: action.coordinates?.[1], + }; + + case "launch": + return { + type: "launch", + packageName: action.package ?? "", + }; + + case "clear": + return { type: "clear" }; + + case "clipboard_set": + return { type: "clipboard_set", text: action.text ?? "" }; + + case "clipboard_get": + return { type: "clipboard_get" }; + + case "paste": + return { type: "paste" }; + + case "open_url": + return { type: "open_url", url: action.url ?? "" }; + + case "switch_app": + return { type: "switch_app", packageName: action.package ?? "" }; + + case "notifications": + return { type: "notifications" }; + + case "keyevent": + return { type: "keyevent", code: action.code ?? 
0 }; + + case "open_settings": + return { type: "open_settings" }; + + case "wait": + return { type: "wait", duration: 2000 }; + + case "done": + return { type: "done" }; + + default: + // Pass through unknown actions -- the device can decide what to do + return { type: action.action }; + } +} + +// ─── Main Agent Loop ──────────────────────────────────────────── + +export async function runAgentLoop( + options: AgentLoopOptions +): Promise { + const { + deviceId, + userId, + goal, + llmConfig, + maxSteps = 30, + onStep, + onComplete, + } = options; + + const sessionId = crypto.randomUUID(); + const llm = getLlmProvider(llmConfig); + const stuck = createStuckDetector(); + const systemPrompt = getSystemPrompt(); + + let prevElements: UIElement[] = []; + let lastScreenHash = ""; + let stuckCount = 0; + const recentActions: string[] = []; + let lastActionFeedback = ""; + + // Notify dashboard that a goal has started + sessions.notifyDashboard(userId, { + type: "goal_started", + sessionId, + goal, + deviceId, + }); + + let stepsUsed = 0; + let success = false; + + try { + for (let step = 0; step < maxSteps; step++) { + stepsUsed = step + 1; + + // ── 1. Get screen state from device ───────────────────── + const screenResponse = (await sessions.sendCommand(deviceId, { + type: "get_screen", + })) as { + elements?: UIElement[]; + screenshot?: string; + packageName?: string; + }; + + const elements = screenResponse.elements ?? []; + const screenshot = screenResponse.screenshot; + const packageName = screenResponse.packageName; + const screenHash = computeScreenHash(elements); + + // ── 2. Screen diff: detect stuck loops ────────────────── + let diffContext = ""; + if (step > 0) { + const diff = diffScreenState(prevElements, elements); + + if (!diff.changed) { + stuckCount++; + if (stuckCount >= 3) { + diffContext += `\nWARNING: You have been stuck for ${stuckCount} steps. The screen is NOT changing.`; + diffContext += + "\nYour plan is NOT working. 
You MUST create a completely NEW plan with a different approach."; + } + } else { + stuckCount = 0; + } + + diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}` + diffContext; + } + prevElements = elements; + + // Repetition detection (persists across screen changes) + if (recentActions.length >= 3) { + const freq = new Map(); + for (const a of recentActions) freq.set(a, (freq.get(a) ?? 0) + 1); + const [topAction, topCount] = [...freq.entries()].reduce( + (a, b) => (b[1] > a[1] ? b : a), + ["", 0] + ); + if (topCount >= 3) { + diffContext += + `\nREPETITION_ALERT: You have attempted "${topAction}" ${topCount} times in recent steps. ` + + `This action is clearly NOT working -- do NOT attempt it again.`; + } + } + + // Drift detection (navigation spam) + if (recentActions.length >= 4) { + const navigationActions = new Set([ + "swipe", + "scroll", + "back", + "home", + "wait", + ]); + const navCount = recentActions + .slice(-5) + .filter((a) => navigationActions.has(a.split("(")[0])).length; + if (navCount >= 4) { + diffContext += + `\nDRIFT_WARNING: Your last ${navCount} actions were all navigation/waiting with no direct interaction. ` + + `STOP scrolling/navigating and take a DIRECT action: tap a specific button, use "type", or use "clipboard_set".`; + } + } + + // ── 3. Vision context ─────────────────────────────────── + let visionContext = ""; + let useScreenshot = false; + + if (elements.length === 0) { + visionContext = + "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " + + "A screenshot has been captured. The screen likely contains custom-drawn " + + "content (game, WebView, or Flutter). Try using coordinate-based taps on " + + "common UI positions, or use 'back'/'home' to navigate away."; + useScreenshot = true; + } else if (stuckCount >= 2) { + visionContext = + "\n\nVISION_ASSIST: You have been stuck -- a screenshot is attached. " + + "Use the screenshot to VISUALLY identify the correct field positions, " + + "buttons, and layout. 
The accessibility tree may be misleading."; + useScreenshot = true; + } else if (elements.length < 3) { + // Very few elements -- vision may help + useScreenshot = true; + } + + // ── 4. Build user prompt ──────────────────────────────── + const foregroundLine = packageName + ? `FOREGROUND_APP: ${packageName}\n\n` + : ""; + const actionFeedbackLine = lastActionFeedback + ? `LAST_ACTION_RESULT: ${lastActionFeedback}\n\n` + : ""; + + let userPrompt = + `GOAL: ${goal}\n\n` + + `STEP: ${step + 1}/${maxSteps}\n\n` + + foregroundLine + + actionFeedbackLine + + `SCREEN_CONTEXT:\n${JSON.stringify(elements, null, 2)}` + + diffContext + + visionContext; + + // Add stuck recovery hint from detector + if (stuck.isStuck()) { + userPrompt += "\n\n" + stuck.getRecoveryHint(); + } + + lastScreenHash = screenHash; + + // ── 5. Call LLM ───────────────────────────────────────── + let rawResponse: string; + try { + rawResponse = await llm.getAction( + systemPrompt, + userPrompt, + useScreenshot ? screenshot : undefined + ); + } catch (err) { + console.error( + `[Agent ${sessionId}] LLM error at step ${step + 1}: ${(err as Error).message}` + ); + stuck.recordAction("llm_error", screenHash); + lastActionFeedback = `llm_error -> FAILED: ${(err as Error).message}`; + continue; + } + + // ── 6. Parse response ─────────────────────────────────── + const parsed = parseJsonResponse(rawResponse); + if (!parsed || !parsed.action) { + console.error( + `[Agent ${sessionId}] Failed to parse LLM response at step ${step + 1}` + ); + stuck.recordAction("parse_error", screenHash); + lastActionFeedback = "parse_error -> FAILED: Could not parse LLM response"; + continue; + } + + const action = parsed as unknown as ActionDecision; + + // Track action for stuck detection + const actionSig = action.coordinates + ? 
`${action.action}(${action.coordinates.join(",")})` + : action.action; + stuck.recordAction(actionSig, screenHash); + recentActions.push(actionSig); + if (recentActions.length > 8) recentActions.shift(); + + // ── 7. Done? ──────────────────────────────────────────── + if (action.action === "done") { + success = true; + break; + } + + // ── 8. Notify dashboard ───────────────────────────────── + const stepData: AgentStep = { + stepNumber: step + 1, + action, + reasoning: action.reason ?? "", + screenHash, + }; + onStep?.(stepData); + + sessions.notifyDashboard(userId, { + type: "step", + sessionId, + step: step + 1, + action: action as unknown as Record, + reasoning: action.reason ?? "", + screenHash, + }); + + // ── 9. Execute on device ──────────────────────────────── + const command = actionToCommand(action); + try { + const result = (await sessions.sendCommand(deviceId, command)) as { + success?: boolean; + error?: string; + data?: string; + }; + const resultSuccess = result.success !== false; + lastActionFeedback = `${actionSig} -> ${resultSuccess ? "OK" : "FAILED"}: ${result.error ?? result.data ?? "completed"}`; + } catch (err) { + lastActionFeedback = `${actionSig} -> FAILED: ${(err as Error).message}`; + console.error( + `[Agent ${sessionId}] Command error at step ${step + 1}: ${(err as Error).message}` + ); + } + + // ── 10. Brief pause for UI to settle ──────────────────── + await new Promise((r) => setTimeout(r, 500)); + } + } catch (error) { + console.error(`[Agent ${sessionId}] Loop error: ${error}`); + } + + const result: AgentResult = { success, stepsUsed, sessionId }; + + sessions.notifyDashboard(userId, { + type: "goal_completed", + sessionId, + success, + stepsUsed, + }); + + onComplete?.(result); + return result; +} diff --git a/server/src/agent/stuck.ts b/server/src/agent/stuck.ts new file mode 100644 index 0000000..a515dc5 --- /dev/null +++ b/server/src/agent/stuck.ts @@ -0,0 +1,97 @@ +/** + * Stuck-loop detection for the DroidClaw agent loop. 
+ * + * Same algorithm as the CLI kernel.ts: sliding window of recent actions + * and screen hashes to detect repetition, with context-aware recovery hints. + */ + +export interface StuckDetector { + recordAction(action: string, screenHash: string): void; + isStuck(): boolean; + getRecoveryHint(): string; + getStuckCount(): number; + reset(): void; +} + +export function createStuckDetector(windowSize: number = 8): StuckDetector { + const recentActions: string[] = []; + const recentHashes: string[] = []; + let unchangedCount = 0; + + return { + recordAction(action: string, screenHash: string) { + // Track screen-unchanged streaks + if (recentHashes.length > 0 && recentHashes[recentHashes.length - 1] === screenHash) { + unchangedCount++; + } else { + unchangedCount = 0; + } + + recentActions.push(action); + recentHashes.push(screenHash); + if (recentActions.length > windowSize) recentActions.shift(); + if (recentHashes.length > windowSize) recentHashes.shift(); + }, + + isStuck(): boolean { + if (recentActions.length < 3) return false; + + // Check 1: The last three actions are identical + const allSameAction = recentActions.slice(-3).every((a) => a === recentActions[recentActions.length - 1]); + + // Check 2: Screen hash hasn't changed for 3+ steps + const allSameHash = unchangedCount >= 3; + + // Check 3: Repetition frequency (same action 3+ times in window) + const freq = new Map(); + for (const a of recentActions) freq.set(a, (freq.get(a) ?? 0) + 1); + const maxFreq = Math.max(...freq.values()); + const highRepetition = maxFreq >= 3; + + return allSameAction || allSameHash || highRepetition; + }, + + getRecoveryHint(): string { + // Context-aware recovery based on what actions are failing + const failingTypes = new Set( + recentActions.slice(-3).map((a) => a.split("(")[0]) + ); + + let hint = + "STUCK DETECTED: You have been repeating the same action or seeing the same screen. 
" + + "Your current approach is NOT working.\n\n"; + + if (failingTypes.has("tap") || failingTypes.has("longpress")) { + hint += + "Your tap/press actions are having NO EFFECT. Likely causes:\n" + + "- The action SUCCEEDED SILENTLY (copy/share/like buttons often work without screen changes). If so, MOVE ON.\n" + + "- The element is not actually interactive at those coordinates.\n" + + "- Use 'clipboard_set' to set clipboard text directly instead of UI copy buttons.\n" + + "- Or just 'type' the text directly in the target app.\n\n"; + } + + if (failingTypes.has("swipe") || failingTypes.has("scroll")) { + hint += + "Swiping is having no effect -- you may be at the end of scrollable content. " + + "Try interacting with visible elements or navigate with 'back'/'home'.\n\n"; + } + + hint += + "Try a completely different approach: scroll to find new elements, go back, " + + "use the home button, try a different app, or use programmatic actions " + + "(clipboard_set, type, launch) instead of UI interactions."; + + return hint; + }, + + getStuckCount(): number { + return unchangedCount; + }, + + reset() { + recentActions.length = 0; + recentHashes.length = 0; + unchangedCount = 0; + }, + }; +} diff --git a/server/src/routes/goals.ts b/server/src/routes/goals.ts index 4a392f1..81756f7 100644 --- a/server/src/routes/goals.ts +++ b/server/src/routes/goals.ts @@ -1,13 +1,25 @@ import { Hono } from "hono"; import { sessionMiddleware, type AuthEnv } from "../middleware/auth.js"; import { sessions } from "../ws/sessions.js"; +import { runAgentLoop, type AgentLoopOptions } from "../agent/loop.js"; +import type { LLMConfig } from "../agent/llm.js"; const goals = new Hono(); goals.use("*", sessionMiddleware); +/** Track running agent sessions so we can prevent duplicates */ +const activeSessions = new Map(); + goals.post("/", async (c) => { const user = c.get("user"); - const body = await c.req.json<{ deviceId: string; goal: string }>(); + const body = await c.req.json<{ + deviceId: 
string; + goal: string; + llmProvider?: string; + llmApiKey?: string; + llmModel?: string; + maxSteps?: number; + }>(); if (!body.deviceId || !body.goal) { return c.json({ error: "deviceId and goal are required" }, 400); @@ -22,14 +34,62 @@ goals.post("/", async (c) => { return c.json({ error: "device does not belong to you" }, 403); } - // TODO (Task 6): start agent loop for this device+goal - const sessionId = crypto.randomUUID(); + // Prevent multiple agent loops on the same device + if (activeSessions.has(body.deviceId)) { + const existing = activeSessions.get(body.deviceId)!; + return c.json( + { error: "agent already running on this device", sessionId: existing.sessionId, goal: existing.goal }, + 409 + ); + } + // Build LLM config from request body or environment defaults + const llmConfig: LLMConfig = { + provider: body.llmProvider ?? process.env.LLM_PROVIDER ?? "openai", + apiKey: body.llmApiKey ?? process.env.LLM_API_KEY ?? "", + model: body.llmModel, + }; + + if (!llmConfig.apiKey) { + return c.json({ error: "LLM API key is required (provide llmApiKey or set LLM_API_KEY env var)" }, 400); + } + + const options: AgentLoopOptions = { + deviceId: body.deviceId, + userId: user.id, + goal: body.goal, + llmConfig, + maxSteps: body.maxSteps, + }; + + // Start the agent loop in the background (fire-and-forget). + // The client observes progress via the /ws/dashboard WebSocket. + const loopPromise = runAgentLoop(options); + + // Track as active until it completes + const trackingId = body.deviceId; + const sessionPlaceholder = { sessionId: "pending", goal: body.goal }; + activeSessions.set(trackingId, sessionPlaceholder); + + loopPromise + .then((result) => { + activeSessions.delete(trackingId); + console.log( + `[Agent] Completed on ${body.deviceId}: ${result.success ? 
"success" : "incomplete"} in ${result.stepsUsed} steps (session ${result.sessionId})` + ); + }) + .catch((err) => { + activeSessions.delete(trackingId); + console.error(`[Agent] Error on ${body.deviceId}: ${err}`); + }); + + // The sessionId is created inside runAgentLoop, so it is not available here. + // The immediate response therefore omits it; the dashboard events carry the real one. + // The loop emits goal_started with its sessionId when it starts. return c.json({ - sessionId, deviceId: body.deviceId, goal: body.goal, - status: "queued", + status: "started", }); });