feat(agent): add pipeline mode with dynamic prompts to agent loop
When pipelineMode is enabled in AgentLoopOptions, the agent loop builds its system prompt on every step with buildDynamicPrompt(), using per-screen context (editable fields, scrollable elements, app hints, stuck state) instead of the static mega-prompt. Legacy mode (the default) is unchanged.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,9 +21,11 @@ import { sessions } from "../ws/sessions.js";
|
|||||||
import {
|
import {
|
||||||
getLlmProvider,
|
getLlmProvider,
|
||||||
getSystemPrompt,
|
getSystemPrompt,
|
||||||
|
buildDynamicPrompt,
|
||||||
parseJsonResponse,
|
parseJsonResponse,
|
||||||
type LLMConfig,
|
type LLMConfig,
|
||||||
} from "./llm.js";
|
} from "./llm.js";
|
||||||
|
import { formatAppHints } from "./hints.js";
|
||||||
import { createStuckDetector } from "./stuck.js";
|
import { createStuckDetector } from "./stuck.js";
|
||||||
import { db } from "../db.js";
|
import { db } from "../db.js";
|
||||||
import { agentSession, agentStep, device as deviceTable } from "../schema.js";
|
import { agentSession, agentStep, device as deviceTable } from "../schema.js";
|
||||||
@@ -42,6 +44,8 @@ export interface AgentLoopOptions {
|
|||||||
originalGoal?: string;
|
originalGoal?: string;
|
||||||
llmConfig: LLMConfig;
|
llmConfig: LLMConfig;
|
||||||
maxSteps?: number;
|
maxSteps?: number;
|
||||||
|
/** If true, use dynamic prompts instead of mega-prompt (pipeline mode) */
|
||||||
|
pipelineMode?: boolean;
|
||||||
/** Abort signal for cancellation */
|
/** Abort signal for cancellation */
|
||||||
signal?: AbortSignal;
|
signal?: AbortSignal;
|
||||||
onStep?: (step: AgentStep) => void;
|
onStep?: (step: AgentStep) => void;
|
||||||
@@ -172,6 +176,8 @@ function actionToCommand(
|
|||||||
return {
|
return {
|
||||||
type: "launch",
|
type: "launch",
|
||||||
packageName: action.package ?? "",
|
packageName: action.package ?? "",
|
||||||
|
intentUri: action.uri,
|
||||||
|
intentExtras: action.extras,
|
||||||
};
|
};
|
||||||
|
|
||||||
case "clear":
|
case "clear":
|
||||||
@@ -199,11 +205,21 @@ function actionToCommand(
|
|||||||
return { type: "keyevent", code: action.code ?? 0 };
|
return { type: "keyevent", code: action.code ?? 0 };
|
||||||
|
|
||||||
case "open_settings":
|
case "open_settings":
|
||||||
return { type: "open_settings" };
|
return { type: "open_settings", setting: action.setting };
|
||||||
|
|
||||||
case "wait":
|
case "wait":
|
||||||
return { type: "wait", duration: 2000 };
|
return { type: "wait", duration: 2000 };
|
||||||
|
|
||||||
|
case "intent":
|
||||||
|
return {
|
||||||
|
type: "intent",
|
||||||
|
intentAction: action.intentAction,
|
||||||
|
intentUri: action.uri,
|
||||||
|
intentType: action.intentType,
|
||||||
|
intentExtras: action.extras,
|
||||||
|
packageName: action.package,
|
||||||
|
};
|
||||||
|
|
||||||
case "done":
|
case "done":
|
||||||
return { type: "done" };
|
return { type: "done" };
|
||||||
|
|
||||||
@@ -234,7 +250,10 @@ export async function runAgentLoop(
|
|||||||
const sessionId = crypto.randomUUID();
|
const sessionId = crypto.randomUUID();
|
||||||
const llm = getLlmProvider(llmConfig);
|
const llm = getLlmProvider(llmConfig);
|
||||||
const stuck = createStuckDetector();
|
const stuck = createStuckDetector();
|
||||||
const systemPrompt = getSystemPrompt();
|
|
||||||
|
// Use legacy prompt if not in pipeline mode (backward compat)
|
||||||
|
const useDynamicPrompt = options.pipelineMode ?? false;
|
||||||
|
const legacySystemPrompt = useDynamicPrompt ? "" : getSystemPrompt();
|
||||||
|
|
||||||
let prevElements: UIElement[] = [];
|
let prevElements: UIElement[] = [];
|
||||||
let lastScreenHash = "";
|
let lastScreenHash = "";
|
||||||
@@ -252,11 +271,16 @@ export async function runAgentLoop(
|
|||||||
.where(eq(deviceTable.id, persistentDeviceId))
|
.where(eq(deviceTable.id, persistentDeviceId))
|
||||||
.limit(1);
|
.limit(1);
|
||||||
const info = rows[0]?.info as Record<string, unknown> | null;
|
const info = rows[0]?.info as Record<string, unknown> | null;
|
||||||
const apps = info?.installedApps as Array<{ packageName: string; label: string }> | undefined;
|
const apps = info?.installedApps as Array<{ packageName: string; label: string; intents?: string[] }> | undefined;
|
||||||
if (apps && apps.length > 0) {
|
if (apps && apps.length > 0) {
|
||||||
|
// Build app list with intent capabilities
|
||||||
|
const appLines = apps.map((a) => {
|
||||||
|
const intents = a.intents?.length ? ` [${a.intents.join(", ")}]` : "";
|
||||||
|
return ` ${a.label}: ${a.packageName}${intents}`;
|
||||||
|
});
|
||||||
installedAppsContext =
|
installedAppsContext =
|
||||||
`\nINSTALLED_APPS (use exact packageName for "launch" action):\n` +
|
`\nINSTALLED_APPS (use exact packageName for "launch", use intents in [] for "intent" action):\n` +
|
||||||
apps.map((a) => ` ${a.label}: ${a.packageName}`).join("\n") +
|
appLines.join("\n") +
|
||||||
"\n";
|
"\n";
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
@@ -391,7 +415,24 @@ export async function runAgentLoop(
|
|||||||
useScreenshot = true;
|
useScreenshot = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── 4. Build user prompt ────────────────────────────────
|
// ── 4. Build prompts ──────────────────────────────────────
|
||||||
|
let systemPrompt: string;
|
||||||
|
if (useDynamicPrompt) {
|
||||||
|
const hasEditableFields = elements.some((e) => e.editable);
|
||||||
|
const hasScrollable = elements.some((e) => e.scrollable);
|
||||||
|
const appHintsStr = packageName ? formatAppHints(packageName) : "";
|
||||||
|
|
||||||
|
systemPrompt = buildDynamicPrompt({
|
||||||
|
hasEditableFields,
|
||||||
|
hasScrollable,
|
||||||
|
foregroundApp: packageName,
|
||||||
|
appHints: appHintsStr,
|
||||||
|
isStuck: stuck.isStuck(),
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
systemPrompt = legacySystemPrompt;
|
||||||
|
}
|
||||||
|
|
||||||
const foregroundLine = packageName
|
const foregroundLine = packageName
|
||||||
? `FOREGROUND_APP: ${packageName}\n\n`
|
? `FOREGROUND_APP: ${packageName}\n\n`
|
||||||
: "";
|
: "";
|
||||||
@@ -402,15 +443,15 @@ export async function runAgentLoop(
|
|||||||
let userPrompt =
|
let userPrompt =
|
||||||
`GOAL: ${goal}\n\n` +
|
`GOAL: ${goal}\n\n` +
|
||||||
`STEP: ${step + 1}/${maxSteps}\n\n` +
|
`STEP: ${step + 1}/${maxSteps}\n\n` +
|
||||||
installedAppsContext +
|
(useDynamicPrompt ? "" : installedAppsContext) +
|
||||||
foregroundLine +
|
foregroundLine +
|
||||||
actionFeedbackLine +
|
actionFeedbackLine +
|
||||||
`SCREEN_CONTEXT:\n${JSON.stringify(elements, null, 2)}` +
|
`SCREEN_CONTEXT:\n${JSON.stringify(elements, null, 2)}` +
|
||||||
diffContext +
|
diffContext +
|
||||||
visionContext;
|
visionContext;
|
||||||
|
|
||||||
// Add stuck recovery hint from detector
|
// Add stuck recovery hint from detector (only for legacy mode; dynamic prompt handles it)
|
||||||
if (stuck.isStuck()) {
|
if (!useDynamicPrompt && stuck.isStuck()) {
|
||||||
userPrompt += "\n\n" + stuck.getRecoveryHint();
|
userPrompt += "\n\n" + stuck.getRecoveryHint();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -433,11 +474,26 @@ export async function runAgentLoop(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── 6. Parse response ───────────────────────────────────
|
// ── 6. Parse response (retry once on failure) ─────────
|
||||||
const parsed = parseJsonResponse(rawResponse);
|
let parsed = parseJsonResponse(rawResponse);
|
||||||
|
if (!parsed || !parsed.action) {
|
||||||
|
console.warn(
|
||||||
|
`[Agent ${sessionId}] Parse failed at step ${step + 1}, retrying LLM. Raw: ${rawResponse.slice(0, 200)}`
|
||||||
|
);
|
||||||
|
try {
|
||||||
|
rawResponse = await llm.getAction(
|
||||||
|
systemPrompt,
|
||||||
|
userPrompt + "\n\nIMPORTANT: Your previous response was not valid JSON. You MUST respond with ONLY a valid JSON object.",
|
||||||
|
useScreenshot ? screenshot : undefined
|
||||||
|
);
|
||||||
|
parsed = parseJsonResponse(rawResponse);
|
||||||
|
} catch {
|
||||||
|
// retry failed, fall through
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!parsed || !parsed.action) {
|
if (!parsed || !parsed.action) {
|
||||||
console.error(
|
console.error(
|
||||||
`[Agent ${sessionId}] Failed to parse LLM response at step ${step + 1}`
|
`[Agent ${sessionId}] Failed to parse LLM response at step ${step + 1}. Raw: ${rawResponse.slice(0, 300)}`
|
||||||
);
|
);
|
||||||
stuck.recordAction("parse_error", screenHash);
|
stuck.recordAction("parse_error", screenHash);
|
||||||
lastActionFeedback = "parse_error -> FAILED: Could not parse LLM response";
|
lastActionFeedback = "parse_error -> FAILED: Could not parse LLM response";
|
||||||
|
|||||||
Reference in New Issue
Block a user