feat(agent): add pipeline mode with dynamic prompts to agent loop

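When pipelineMode is enabled in AgentLoopOptions, the loop uses
buildDynamicPrompt() with per-screen context (editable fields,
scrollable elements, app hints, stuck state) instead of the static
mega-prompt. Legacy mode (default) keeps the static system prompt.

Also included, independent of pipelineMode: a new "intent" action,
intent URI/extras on "launch", a setting argument for
"open_settings", and a single LLM retry when the response cannot be
parsed as JSON. The INSTALLED_APPS context (sent in legacy mode only)
now lists each app's supported intents.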

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sanju Sivalingam
2026-02-18 00:24:25 +05:30
parent 3f389c5de6
commit 18b8509081

View File

@@ -21,9 +21,11 @@ import { sessions } from "../ws/sessions.js";
import { import {
getLlmProvider, getLlmProvider,
getSystemPrompt, getSystemPrompt,
buildDynamicPrompt,
parseJsonResponse, parseJsonResponse,
type LLMConfig, type LLMConfig,
} from "./llm.js"; } from "./llm.js";
import { formatAppHints } from "./hints.js";
import { createStuckDetector } from "./stuck.js"; import { createStuckDetector } from "./stuck.js";
import { db } from "../db.js"; import { db } from "../db.js";
import { agentSession, agentStep, device as deviceTable } from "../schema.js"; import { agentSession, agentStep, device as deviceTable } from "../schema.js";
@@ -42,6 +44,8 @@ export interface AgentLoopOptions {
originalGoal?: string; originalGoal?: string;
llmConfig: LLMConfig; llmConfig: LLMConfig;
maxSteps?: number; maxSteps?: number;
/** If true, use dynamic prompts instead of mega-prompt (pipeline mode) */
pipelineMode?: boolean;
/** Abort signal for cancellation */ /** Abort signal for cancellation */
signal?: AbortSignal; signal?: AbortSignal;
onStep?: (step: AgentStep) => void; onStep?: (step: AgentStep) => void;
@@ -172,6 +176,8 @@ function actionToCommand(
return { return {
type: "launch", type: "launch",
packageName: action.package ?? "", packageName: action.package ?? "",
intentUri: action.uri,
intentExtras: action.extras,
}; };
case "clear": case "clear":
@@ -199,11 +205,21 @@ function actionToCommand(
return { type: "keyevent", code: action.code ?? 0 }; return { type: "keyevent", code: action.code ?? 0 };
case "open_settings": case "open_settings":
return { type: "open_settings" }; return { type: "open_settings", setting: action.setting };
case "wait": case "wait":
return { type: "wait", duration: 2000 }; return { type: "wait", duration: 2000 };
case "intent":
return {
type: "intent",
intentAction: action.intentAction,
intentUri: action.uri,
intentType: action.intentType,
intentExtras: action.extras,
packageName: action.package,
};
case "done": case "done":
return { type: "done" }; return { type: "done" };
@@ -234,7 +250,10 @@ export async function runAgentLoop(
const sessionId = crypto.randomUUID(); const sessionId = crypto.randomUUID();
const llm = getLlmProvider(llmConfig); const llm = getLlmProvider(llmConfig);
const stuck = createStuckDetector(); const stuck = createStuckDetector();
const systemPrompt = getSystemPrompt();
// Use legacy prompt if not in pipeline mode (backward compat)
const useDynamicPrompt = options.pipelineMode ?? false;
const legacySystemPrompt = useDynamicPrompt ? "" : getSystemPrompt();
let prevElements: UIElement[] = []; let prevElements: UIElement[] = [];
let lastScreenHash = ""; let lastScreenHash = "";
@@ -252,11 +271,16 @@ export async function runAgentLoop(
.where(eq(deviceTable.id, persistentDeviceId)) .where(eq(deviceTable.id, persistentDeviceId))
.limit(1); .limit(1);
const info = rows[0]?.info as Record<string, unknown> | null; const info = rows[0]?.info as Record<string, unknown> | null;
const apps = info?.installedApps as Array<{ packageName: string; label: string }> | undefined; const apps = info?.installedApps as Array<{ packageName: string; label: string; intents?: string[] }> | undefined;
if (apps && apps.length > 0) { if (apps && apps.length > 0) {
// Build app list with intent capabilities
const appLines = apps.map((a) => {
const intents = a.intents?.length ? ` [${a.intents.join(", ")}]` : "";
return ` ${a.label}: ${a.packageName}${intents}`;
});
installedAppsContext = installedAppsContext =
`\nINSTALLED_APPS (use exact packageName for "launch" action):\n` + `\nINSTALLED_APPS (use exact packageName for "launch", use intents in [] for "intent" action):\n` +
apps.map((a) => ` ${a.label}: ${a.packageName}`).join("\n") + appLines.join("\n") +
"\n"; "\n";
} }
} catch { } catch {
@@ -391,7 +415,24 @@ export async function runAgentLoop(
useScreenshot = true; useScreenshot = true;
} }
// ── 4. Build user prompt ──────────────────────────────── // ── 4. Build prompts ──────────────────────────────────────
let systemPrompt: string;
if (useDynamicPrompt) {
const hasEditableFields = elements.some((e) => e.editable);
const hasScrollable = elements.some((e) => e.scrollable);
const appHintsStr = packageName ? formatAppHints(packageName) : "";
systemPrompt = buildDynamicPrompt({
hasEditableFields,
hasScrollable,
foregroundApp: packageName,
appHints: appHintsStr,
isStuck: stuck.isStuck(),
});
} else {
systemPrompt = legacySystemPrompt;
}
const foregroundLine = packageName const foregroundLine = packageName
? `FOREGROUND_APP: ${packageName}\n\n` ? `FOREGROUND_APP: ${packageName}\n\n`
: ""; : "";
@@ -402,15 +443,15 @@ export async function runAgentLoop(
let userPrompt = let userPrompt =
`GOAL: ${goal}\n\n` + `GOAL: ${goal}\n\n` +
`STEP: ${step + 1}/${maxSteps}\n\n` + `STEP: ${step + 1}/${maxSteps}\n\n` +
installedAppsContext + (useDynamicPrompt ? "" : installedAppsContext) +
foregroundLine + foregroundLine +
actionFeedbackLine + actionFeedbackLine +
`SCREEN_CONTEXT:\n${JSON.stringify(elements, null, 2)}` + `SCREEN_CONTEXT:\n${JSON.stringify(elements, null, 2)}` +
diffContext + diffContext +
visionContext; visionContext;
// Add stuck recovery hint from detector // Add stuck recovery hint from detector (only for legacy mode; dynamic prompt handles it)
if (stuck.isStuck()) { if (!useDynamicPrompt && stuck.isStuck()) {
userPrompt += "\n\n" + stuck.getRecoveryHint(); userPrompt += "\n\n" + stuck.getRecoveryHint();
} }
@@ -433,11 +474,26 @@ export async function runAgentLoop(
continue; continue;
} }
// ── 6. Parse response ─────────────────────────────────── // ── 6. Parse response (retry once on failure) ─────────
const parsed = parseJsonResponse(rawResponse); let parsed = parseJsonResponse(rawResponse);
if (!parsed || !parsed.action) {
console.warn(
`[Agent ${sessionId}] Parse failed at step ${step + 1}, retrying LLM. Raw: ${rawResponse.slice(0, 200)}`
);
try {
rawResponse = await llm.getAction(
systemPrompt,
userPrompt + "\n\nIMPORTANT: Your previous response was not valid JSON. You MUST respond with ONLY a valid JSON object.",
useScreenshot ? screenshot : undefined
);
parsed = parseJsonResponse(rawResponse);
} catch {
// retry failed, fall through
}
}
if (!parsed || !parsed.action) { if (!parsed || !parsed.action) {
console.error( console.error(
`[Agent ${sessionId}] Failed to parse LLM response at step ${step + 1}` `[Agent ${sessionId}] Failed to parse LLM response at step ${step + 1}. Raw: ${rawResponse.slice(0, 300)}`
); );
stuck.recordAction("parse_error", screenHash); stuck.recordAction("parse_error", screenHash);
lastActionFeedback = "parse_error -> FAILED: Could not parse LLM response"; lastActionFeedback = "parse_error -> FAILED: Could not parse LLM response";