feat(agent): add dynamic prompt builder for Stage 3 UI agent

This commit is contained in:
Sanju Sivalingam
2026-02-18 00:22:24 +05:30
parent 91a828452b
commit 3f389c5de6

View File

@@ -244,6 +244,107 @@ ${capabilitySummary}`;
return { system, user };
}
/**
* Builds a dynamic system prompt for Stage 3 (UI Agent).
*
* Instead of listing all 23 actions every step, shows only actions
* relevant to the current screen state. Results in ~50 lines vs 211.
*/
export function buildDynamicPrompt(options: {
hasEditableFields: boolean;
hasScrollable: boolean;
foregroundApp?: string;
appHints: string;
isStuck: boolean;
}): string {
const { hasEditableFields, hasScrollable, foregroundApp, appHints, isStuck } =
options;
// ── Base actions (always available) ──
let actions = `Navigation:
{"action": "tap", "coordinates": [x, y], "reason": "..."}
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
{"action": "back", "reason": "Navigate back"}
{"action": "home", "reason": "Go to home screen"}
{"action": "wait", "reason": "Wait for screen to load"}
{"action": "done", "reason": "Task is complete"}`;
// ── Conditional actions ──
if (hasEditableFields) {
actions += `
Text Input (ALWAYS include coordinates to focus the correct field):
{"action": "type", "coordinates": [x, y], "text": "...", "reason": "..."}
{"action": "clear", "reason": "Clear current text field"}
{"action": "enter", "reason": "Press Enter/submit"}
{"action": "clipboard_set", "text": "...", "reason": "Set clipboard"}
{"action": "paste", "coordinates": [x, y], "reason": "Paste clipboard"}`;
}
if (hasScrollable) {
actions += `
Scrolling:
{"action": "scroll", "direction": "up|down|left|right", "reason": "Scroll to see more"}`;
}
// Always include app control (lightweight)
actions += `
App Control:
{"action": "launch", "package": "com.app.name", "reason": "Open app"}
{"action": "switch_app", "package": "com.app.name", "reason": "Switch app"}`;
// Multi-step actions (always useful)
actions += `
Multi-Step:
{"action": "read_screen", "reason": "Scroll through page, collect all text to clipboard"}
{"action": "find_and_tap", "query": "Button Label", "reason": "Find and tap element by text"}
{"action": "copy_visible_text", "reason": "Copy all visible text to clipboard"}`;
if (hasEditableFields) {
actions += `
{"action": "submit_message", "reason": "Find Send button and tap it"}
{"action": "compose_email", "query": "email@addr", "reason": "Fill email fields"}`;
}
// ── Build full prompt ──
let prompt = `You are an Android UI Agent. Achieve the sub-goal by interacting with the current screen.
You receive: GOAL, FOREGROUND_APP, LAST_ACTION_RESULT, SCREEN_CONTEXT (JSON elements with coordinates), SCREEN_CHANGE.
Output ONLY a valid JSON object with your next action. Include a "think" field with brief reasoning.
ACTIONS:
${actions}
RULES:
- Use coordinates from SCREEN_CONTEXT "center" field — never guess.
- Do NOT tap elements with "enabled": false.
- ALWAYS include "coordinates" with "type" action to focus the correct field.
- If SCREEN_CHANGE says "NOT changed", your last action had no effect — change strategy.
- Do NOT repeat an action that already failed.
- Say "done" as soon as the goal is achieved.
- COPY-PASTE: Use clipboard_set with text from SCREEN_CONTEXT (most reliable), then paste. Or just type directly.`;
if (isStuck) {
prompt += `
STUCK RECOVERY — your current approach is NOT working:
1. The action may have SUCCEEDED SILENTLY (copy/share/like buttons work without screen changes) — move on.
2. Use programmatic alternatives (clipboard_set, type, launch).
3. Try a completely different UI element or approach.
4. Navigate away (back/home) ONLY as absolute last resort.`;
}
if (appHints) {
prompt += appHints;
}
return prompt;
}
// ─── Provider Implementation ────────────────────────────────────
const BASE_URLS: Record<string, string> = {