From db995e4913430fa27f6eda121ecde28f6787938d Mon Sep 17 00:00:00 2001 From: Sanju Sivalingam Date: Wed, 18 Feb 2026 00:53:13 +0530 Subject: [PATCH] fix(agent): prevent stuck loop by adding action history to LLM prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The UI agent had no memory of previous actions — each step was a fresh single-shot LLM call. After typing and sending a message, the LLM saw an empty text field and retyped the message in a loop. - Add RECENT_ACTIONS (last 5 actions with text/result) to user prompt - Add chat app completion detection rule to dynamic prompt - Add send-success hints for WhatsApp and Messages apps - Add git convention to CLAUDE.md (no co-author lines) --- CLAUDE.md | 4 ++++ server/src/agent/hints.ts | 2 ++ server/src/agent/llm.ts | 2 ++ server/src/agent/loop.ts | 18 ++++++++++++++++++ 4 files changed, 26 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index d9aacd0..9c2e673 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -63,3 +63,7 @@ Copy `.env.example` to `.env` and configure `LLM_PROVIDER` + the corresponding A ## Device Assumptions Swipe coordinates in `constants.ts` are hardcoded for 1080px-wide screens (center X=540, center Y=1200). Adjust `SWIPE_COORDS` and `SCREEN_CENTER_*` for different resolutions. + +## Git Conventions + +- Do NOT add `Co-Authored-By: Claude` lines to commit messages. diff --git a/server/src/agent/hints.ts b/server/src/agent/hints.ts index cce0bca..29bed87 100644 --- a/server/src/agent/hints.ts +++ b/server/src/agent/hints.ts @@ -13,6 +13,7 @@ const APP_HINTS: Record<string, string[]> = { ], "com.whatsapp": [ "To send a message, tap the green arrow/send button — do NOT use 'enter' key.", + "SEND SUCCESS: After tapping send, the text field clears and your message appears as a chat bubble. If the field is EMPTY and you see your message in the conversation — it was sent. 
Say done.", "New chat: tap the green floating button (bottom-right), then search for contact.", "Media: tap the + or paperclip icon to attach files/images.", ], @@ -43,6 +44,7 @@ const APP_HINTS: Record<string, string[]> = { "New message: tap the floating button with + or pencil icon.", "To field: type the contact name or number, then select from suggestions.", "Send: tap the arrow/send icon, not Enter.", + "SEND SUCCESS: After send, if the text field is EMPTY and your message appears in the conversation, it was sent. Say done.", ], "com.google.android.dialer": [ "Dial pad: tap the floating phone icon if dial pad isn't visible.", diff --git a/server/src/agent/llm.ts b/server/src/agent/llm.ts index de93888..b4fb172 100644 --- a/server/src/agent/llm.ts +++ b/server/src/agent/llm.ts @@ -327,6 +327,8 @@ RULES: - If SCREEN_CHANGE says "NOT changed", your last action had no effect — change strategy. - Do NOT repeat an action that already failed. - Say "done" as soon as the goal is achieved. +- CHECK RECENT_ACTIONS before every step: if you already typed text and tapped send, do NOT type it again. +- CHAT APP COMPLETION: After typing a message and tapping send in a chat app (WhatsApp, Messages, etc.), if the text field is now EMPTY and your message text appears in the conversation above, the message was SENT SUCCESSFULLY. Say "done" immediately. - COPY-PASTE: Use clipboard_set with text from SCREEN_CONTEXT (most reliable), then paste. 
Or just type directly.`; if (isStuck) { diff --git a/server/src/agent/loop.ts b/server/src/agent/loop.ts index 03bc77d..8f80dc4 100644 --- a/server/src/agent/loop.ts +++ b/server/src/agent/loop.ts @@ -260,6 +260,7 @@ export async function runAgentLoop( let stuckCount = 0; const recentActions: string[] = []; let lastActionFeedback = ""; + const actionHistory: string[] = []; // Human-readable log of recent actions // Fetch installed apps from device metadata for LLM context let installedAppsContext = ""; @@ -440,12 +441,18 @@ export async function runAgentLoop( ? `LAST_ACTION_RESULT: ${lastActionFeedback}\n\n` : ""; + // Include recent action history so LLM knows what it already did + const historyContext = actionHistory.length > 0 + ? `RECENT_ACTIONS (what you already did — do NOT repeat completed work):\n${actionHistory.map((h) => ` ${h}`).join("\n")}\n\n` + : ""; + let userPrompt = `GOAL: ${goal}\n\n` + `STEP: ${step + 1}/${maxSteps}\n\n` + (useDynamicPrompt ? "" : installedAppsContext) + foregroundLine + actionFeedbackLine + + historyContext + `SCREEN_CONTEXT:\n${JSON.stringify(elements, null, 2)}` + diffContext + visionContext; @@ -510,6 +517,13 @@ export async function runAgentLoop( recentActions.push(actionSig); if (recentActions.length > 8) recentActions.shift(); + // Build human-readable history entry for LLM context + const historyParts = [`Step ${step + 1}: ${actionSig}`]; + if (action.text) historyParts.push(`text="${action.text}"`); + if (action.reason) historyParts.push(`— ${action.reason}`); + actionHistory.push(historyParts.join(" ")); + if (actionHistory.length > 5) actionHistory.shift(); + // ── 7. Log + Done check ──────────────────────────────── const reason = action.reason ?? ""; console.log(`[Agent ${sessionId}] Step ${step + 1}: ${actionSig} — ${reason}`); @@ -565,6 +579,10 @@ export async function runAgentLoop( const resultSuccess = result.success !== false; lastActionFeedback = `${actionSig} -> ${resultSuccess ? 
"OK" : "FAILED"}: ${result.error ?? result.data ?? "completed"}`; console.log(`[Agent ${sessionId}] Step ${step + 1} result: ${lastActionFeedback}`); + // Append result to last history entry + if (actionHistory.length > 0) { + actionHistory[actionHistory.length - 1] += ` → ${resultSuccess ? "OK" : "FAILED"}`; + } // Update step result in DB if (persistentDeviceId) { db.update(agentStep).set({ result: lastActionFeedback }).where(eq(agentStep.id, stepId))