From db995e4913430fa27f6eda121ecde28f6787938d Mon Sep 17 00:00:00 2001
From: Sanju Sivalingam <spikeysanju98@gmail.com>
Date: Wed, 18 Feb 2026 00:53:13 +0530
Subject: [PATCH] fix(agent): prevent stuck loop by adding action history to
 LLM prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The UI agent had no memory of previous actions — each step was a fresh
single-shot LLM call. After typing and sending a message, the LLM saw
an empty text field and retyped the message in a loop.

- Add RECENT_ACTIONS (last 5 actions, each with its text, reason, and
  OK/FAILED result) to user prompt
- Add chat app completion detection rule to dynamic prompt
- Add send-success hints for WhatsApp and Messages apps
- Add git convention to CLAUDE.md (no co-author lines)
---
 CLAUDE.md                 |  4 ++++
 server/src/agent/hints.ts |  2 ++
 server/src/agent/llm.ts   |  2 ++
 server/src/agent/loop.ts  | 18 ++++++++++++++++++
 4 files changed, 26 insertions(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index d9aacd0..9c2e673 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -63,3 +63,7 @@ Copy `.env.example` to `.env` and configure `LLM_PROVIDER` + the corresponding A
 ## Device Assumptions
 
 Swipe coordinates in `constants.ts` are hardcoded for 1080px-wide screens (center X=540, center Y=1200). Adjust `SWIPE_COORDS` and `SCREEN_CENTER_*` for different resolutions.
+
+## Git Conventions
+
+- Do NOT add `Co-Authored-By: Claude` lines to commit messages.
diff --git a/server/src/agent/hints.ts b/server/src/agent/hints.ts
index cce0bca..29bed87 100644
--- a/server/src/agent/hints.ts
+++ b/server/src/agent/hints.ts
@@ -13,6 +13,7 @@ const APP_HINTS: Record<string, string[]> = {
   ],
   "com.whatsapp": [
     "To send a message, tap the green arrow/send button — do NOT use 'enter' key.",
+    "SEND SUCCESS: After tapping send, the text field clears and your message appears as a chat bubble. If the field is EMPTY and you see your message in the conversation — it was sent. Say done.",
     "New chat: tap the green floating button (bottom-right), then search for contact.",
     "Media: tap the + or paperclip icon to attach files/images.",
   ],
@@ -43,6 +44,7 @@ const APP_HINTS: Record<string, string[]> = {
     "New message: tap the floating button with + or pencil icon.",
     "To field: type the contact name or number, then select from suggestions.",
     "Send: tap the arrow/send icon, not Enter.",
+    "SEND SUCCESS: After send, if the text field is EMPTY and your message appears in the conversation, it was sent. Say done.",
   ],
   "com.google.android.dialer": [
     "Dial pad: tap the floating phone icon if dial pad isn't visible.",
diff --git a/server/src/agent/llm.ts b/server/src/agent/llm.ts
index de93888..b4fb172 100644
--- a/server/src/agent/llm.ts
+++ b/server/src/agent/llm.ts
@@ -327,6 +327,8 @@ RULES:
 - If SCREEN_CHANGE says "NOT changed", your last action had no effect — change strategy.
 - Do NOT repeat an action that already failed.
 - Say "done" as soon as the goal is achieved.
+- CHECK RECENT_ACTIONS before every step: if you already typed text and tapped send, do NOT type it again.
+- CHAT APP COMPLETION: After typing a message and tapping send in a chat app (WhatsApp, Messages, etc.), if the text field is now EMPTY and your message text appears in the conversation above, the message was SENT SUCCESSFULLY. Say "done" immediately.
 - COPY-PASTE: Use clipboard_set with text from SCREEN_CONTEXT (most reliable), then paste. Or just type directly.`;
 
   if (isStuck) {
diff --git a/server/src/agent/loop.ts b/server/src/agent/loop.ts
index 03bc77d..8f80dc4 100644
--- a/server/src/agent/loop.ts
+++ b/server/src/agent/loop.ts
@@ -260,6 +260,7 @@ export async function runAgentLoop(
   let stuckCount = 0;
   const recentActions: string[] = [];
   let lastActionFeedback = "";
+  const actionHistory: string[] = []; // Human-readable log of recent actions
 
   // Fetch installed apps from device metadata for LLM context
   let installedAppsContext = "";
@@ -440,12 +441,18 @@ export async function runAgentLoop(
         ? `LAST_ACTION_RESULT: ${lastActionFeedback}\n\n`
         : "";
 
+      // Include recent action history so LLM knows what it already did
+      const historyContext = actionHistory.length > 0
+        ? `RECENT_ACTIONS (what you already did — do NOT repeat completed work):\n${actionHistory.map((h) => `  ${h}`).join("\n")}\n\n`
+        : "";
+
       let userPrompt =
         `GOAL: ${goal}\n\n` +
         `STEP: ${step + 1}/${maxSteps}\n\n` +
         (useDynamicPrompt ? "" : installedAppsContext) +
         foregroundLine +
         actionFeedbackLine +
+        historyContext +
         `SCREEN_CONTEXT:\n${JSON.stringify(elements, null, 2)}` +
         diffContext +
         visionContext;
@@ -510,6 +517,13 @@ export async function runAgentLoop(
       recentActions.push(actionSig);
       if (recentActions.length > 8) recentActions.shift();
 
+      // Build human-readable history entry for LLM context
+      const historyParts = [`Step ${step + 1}: ${actionSig}`];
+      if (action.text) historyParts.push(`text="${action.text}"`);
+      if (action.reason) historyParts.push(`— ${action.reason}`);
+      actionHistory.push(historyParts.join(" "));
+      if (actionHistory.length > 5) actionHistory.shift();
+
       // ── 7. Log + Done check ────────────────────────────────
       const reason = action.reason ?? "";
       console.log(`[Agent ${sessionId}] Step ${step + 1}: ${actionSig} — ${reason}`);
@@ -565,6 +579,10 @@ export async function runAgentLoop(
         const resultSuccess = result.success !== false;
         lastActionFeedback = `${actionSig} -> ${resultSuccess ? "OK" : "FAILED"}: ${result.error ?? result.data ?? "completed"}`;
         console.log(`[Agent ${sessionId}] Step ${step + 1} result: ${lastActionFeedback}`);
+        // Append result to last history entry
+        if (actionHistory.length > 0) {
+          actionHistory[actionHistory.length - 1] += ` → ${resultSuccess ? "OK" : "FAILED"}`;
+        }
         // Update step result in DB
         if (persistentDeviceId) {
           db.update(agentStep).set({ result: lastActionFeedback }).where(eq(agentStep.id, stepId))