Flatten project structure: move android-action-kernel/ to root
Removes the unnecessary nesting — all source, config, and docs now live at the project root for simpler paths and commands. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
390
src/actions.ts
Normal file
390
src/actions.ts
Normal file
@@ -0,0 +1,390 @@
|
||||
/**
|
||||
* Action execution module for Android Action Kernel.
|
||||
* Handles all ADB commands for interacting with Android devices.
|
||||
*
|
||||
* Supported actions:
|
||||
* tap, type, enter, swipe, home, back, wait, done,
|
||||
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
|
||||
*/
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
KEYCODE_ENTER,
|
||||
KEYCODE_HOME,
|
||||
KEYCODE_BACK,
|
||||
KEYCODE_DEL,
|
||||
KEYCODE_MOVE_HOME,
|
||||
KEYCODE_MOVE_END,
|
||||
SWIPE_COORDS,
|
||||
SWIPE_DURATION_MS,
|
||||
LONG_PRESS_DURATION_MS,
|
||||
DEVICE_SCREENSHOT_PATH,
|
||||
LOCAL_SCREENSHOT_PATH,
|
||||
computeSwipeCoords,
|
||||
} from "./constants.js";
|
||||
|
||||
/**
 * One decision produced by the LLM: the action name plus whichever optional
 * fields that action consumes. Each executor reads only the fields it needs.
 */
export interface ActionDecision {
  /** Action name that `executeAction` dispatches on (e.g. "tap", "swipe"). */
  action: string;
  /** Free-form explanation; logged by the "done" action. */
  reason?: string;

  // --- pointer / text input ---
  /** Screen position as [x, y]; read by "tap" and "longpress". */
  coordinates?: [number, number];
  /** Text payload; read by "type" and "clipboard_set". */
  text?: string;
  /** Key into the swipe-coordinate table; read by "swipe". */
  direction?: string;

  // --- launch action ---
  /** Package name of the app to launch. */
  package?: string;
  /** Activity name, combined with `package` as "package/activity". */
  activity?: string;
  /** Data URI opened with a VIEW intent. */
  uri?: string;
  /** String extras attached to the launch intent. */
  extras?: Record<string, string>;

  // --- shell action ---
  /** Command line to run through `adb shell`. */
  command?: string;

  // --- screenshot action ---
  /** Local path the screenshot is pulled to. */
  filename?: string;

  // --- planning fields (Phase 4B) ---
  think?: string;
  plan?: string[];
  planProgress?: string;
}
|
||||
|
||||
/** Outcome of executing one action, as returned by `executeAction`. */
export interface ActionResult {
  /** Whether the executor reported success. */
  success: boolean;
  /** Human-readable summary of what was done (or why it was not). */
  message: string;
  /** Optional payload such as command output or a saved file path. */
  data?: string;
}
|
||||
|
||||
/**
 * Runs `adb <command...>` synchronously and returns its trimmed stdout.
 *
 * An attempt counts as failed when stderr is non-empty and contains "error"
 * (case-insensitive). Failed attempts are retried with exponential backoff
 * (1s, 2s, 4s, ...). When the retries are exhausted the error is logged and
 * the stdout of the final attempt is returned anyway; ADB errors are never
 * turned into exceptions here.
 *
 * @param command Arguments passed to the ADB binary (`Config.ADB_PATH`).
 * @param retries Number of retries after the first attempt. Non-finite or
 *   negative values are treated as 0 and fractions are floored, so the
 *   command always runs at least once.
 * @returns Trimmed stdout of the last attempt.
 */
export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): string {
  // Config.MAX_RETRIES is produced by parseInt() on an env var and can be NaN;
  // without this guard `attempt <= NaN` is false and the command never runs.
  const maxRetries = Number.isFinite(retries) && retries > 0 ? Math.floor(retries) : 0;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const result = Bun.spawnSync([Config.ADB_PATH, ...command], {
      stdout: "pipe",
      stderr: "pipe",
    });

    const stdout = result.stdout.toString().trim();
    const stderr = result.stderr.toString().trim();

    const failed = stderr !== "" && stderr.toLowerCase().includes("error");
    if (!failed) {
      return stdout;
    }

    if (attempt === maxRetries) {
      console.log(`ADB Error (all retries exhausted): ${stderr}`);
      return stdout;
    }

    const delay = Math.pow(2, attempt) * 1000;
    console.log(`ADB Error (attempt ${attempt + 1}/${maxRetries + 1}): ${stderr}`);
    console.log(`Retrying in ${delay / 1000}s...`);
    Bun.sleepSync(delay);
  }

  // Unreachable: the loop body always returns on its final iteration.
  return "";
}
|
||||
|
||||
// ===========================================
|
||||
// Device Intelligence (Phase 1)
|
||||
// ===========================================
|
||||
|
||||
/** Module-level dynamic swipe coords, set by initDeviceContext() */
|
||||
let dynamicSwipeCoords: Record<string, [number, number, number, number]> | null = null;
|
||||
|
||||
/**
 * Queries `wm size` over ADB and parses the reported screen size.
 *
 * An "Override size:" entry takes precedence over "Physical size:".
 *
 * @returns `[width, height]`, or `null` when neither entry is found or the
 *   ADB call throws.
 */
export function getScreenResolution(): [number, number] | null {
  // Checked in priority order.
  const sizePatterns = [
    /Override size:\s*(\d+)x(\d+)/,
    /Physical size:\s*(\d+)x(\d+)/,
  ];

  try {
    const wmOutput = runAdbCommand(["shell", "wm", "size"]);
    for (const pattern of sizePatterns) {
      const found = wmOutput.match(pattern);
      if (found) {
        const width = parseInt(found[1], 10);
        const height = parseInt(found[2], 10);
        return [width, height];
      }
    }
  } catch {
    console.log("Warning: Could not detect screen resolution.");
  }

  return null;
}
|
||||
|
||||
/**
 * Looks up the resumed activity in `dumpsys activity activities`.
 *
 * @returns The first "package/activity" token on the `mResumedActivity`
 *   line (with one "}" removed), or `null` when no such line is found or
 *   the lookup throws. Failures are deliberately silent (best effort).
 */
export function getForegroundApp(): string | null {
  try {
    const dump = runAdbCommand(["shell", "dumpsys", "activity", "activities"]);
    const resumed = /mResumedActivity.*?(\S+\/\S+)/.exec(dump);
    if (resumed !== null) {
      const component = resumed[1];
      return component.replace("}", "");
    }
  } catch {
    // best effort: treat any failure as "unknown foreground app"
  }
  return null;
}
|
||||
|
||||
/**
 * Computes swipe coordinates for the given screen size and stores them in
 * the module-level `dynamicSwipeCoords`, replacing any previous value.
 *
 * @param resolution Screen size as `[width, height]`.
 */
export function initDeviceContext(resolution: [number, number]): void {
  const [width, height] = resolution;
  dynamicSwipeCoords = computeSwipeCoords(width, height);
}
|
||||
|
||||
/**
 * Picks the swipe-coordinate table to use: the one stored by
 * `initDeviceContext()` when present, otherwise the `SWIPE_COORDS` defaults.
 */
function getSwipeCoords(): Record<string, [number, number, number, number]> {
  if (dynamicSwipeCoords != null) {
    return dynamicSwipeCoords;
  }
  return SWIPE_COORDS;
}
|
||||
|
||||
/**
 * Dispatches an LLM decision to the executor registered for its `action`
 * name and returns that executor's result.
 *
 * Unknown action names are logged and reported as a failed result rather
 * than thrown.
 */
export function executeAction(action: ActionDecision): ActionResult {
  // A Map (not a plain object) so names like "constructor" cannot match.
  const handlers = new Map<string, (decision: ActionDecision) => ActionResult>([
    ["tap", executeTap],
    ["type", executeType],
    ["enter", executeEnter],
    ["swipe", executeSwipe],
    ["home", executeHome],
    ["back", executeBack],
    ["wait", executeWait],
    ["done", executeDone],
    ["longpress", executeLongPress],
    ["screenshot", executeScreenshot],
    ["launch", executeLaunch],
    ["clear", executeClear],
    ["clipboard_get", executeClipboardGet],
    ["clipboard_set", executeClipboardSet],
    ["shell", executeShell],
  ]);

  const handler = handlers.get(action.action);
  if (handler === undefined) {
    console.log(`Warning: Unknown action: ${action.action}`);
    return { success: false, message: `Unknown action: ${action.action}` };
  }
  return handler(action);
}
|
||||
|
||||
// ===========================================
|
||||
// Original actions (enhanced)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Taps the screen at `action.coordinates` via `adb shell input tap`.
 *
 * @returns A failed result (and no ADB call) when the coordinates are
 *   missing or are not two finite numbers; otherwise a success result.
 */
function executeTap(action: ActionDecision): ActionResult {
  const coords = action.coordinates;
  // The decision comes from parsed LLM output, so check the runtime shape
  // instead of silently tapping the top-left corner at (0, 0).
  if (
    !Array.isArray(coords) ||
    coords.length < 2 ||
    !Number.isFinite(coords[0]) ||
    !Number.isFinite(coords[1])
  ) {
    return { success: false, message: "No valid coordinates to tap" };
  }

  const [x, y] = coords;
  console.log(`Tapping: (${x}, ${y})`);
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);
  return { success: true, message: `Tapped (${x}, ${y})` };
}
|
||||
|
||||
/**
 * Types `action.text` into the focused field via `adb shell input text`.
 *
 * The text is interpreted by the device shell, so every shell metacharacter
 * is backslash-escaped (including `$` and backticks, which would otherwise
 * allow command substitution). Whitespace characters are sent as `%s`, the
 * space placeholder understood by `input text`; tabs and newlines therefore
 * arrive as spaces instead of splitting the command.
 *
 * @returns A failed result when there is no text; otherwise success.
 */
function executeType(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to type" };

  const escapedText = text
    // Single pass, so the backslashes added here are never re-escaped.
    .replace(/[\\"'`$&|;()<>*?#~!{}[\]^]/g, "\\$&")
    .replace(/\s/g, "%s");

  console.log(`Typing: ${text}`);
  runAdbCommand(["shell", "input", "text", escapedText]);
  return { success: true, message: `Typed "${text}"` };
}
|
||||
|
||||
/** Sends the Enter key event to the device and reports success. */
function executeEnter(): ActionResult {
  const keyeventArgs = ["shell", "input", "keyevent", KEYCODE_ENTER];
  console.log("Pressing Enter");
  runAdbCommand(keyeventArgs);
  return { success: true, message: "Pressed Enter" };
}
|
||||
|
||||
/**
 * Swipes in `action.direction` using the current swipe-coordinate table.
 *
 * The direction is trimmed and lower-cased before lookup, so "UP" and
 * " Left " work. A missing or unknown direction falls back to "up", and
 * the result message reports the direction actually swiped.
 */
function executeSwipe(action: ActionDecision): ActionResult {
  const swipeCoords = getSwipeCoords();
  const requested = String(action.direction ?? "up").trim().toLowerCase();

  // Own-property check so names like "constructor" cannot match the prototype.
  const isKnown = Object.prototype.hasOwnProperty.call(swipeCoords, requested);
  if (!isKnown) {
    console.log(`Warning: Unknown swipe direction "${action.direction}", using "up"`);
  }
  const direction = isKnown ? requested : "up";
  const [startX, startY, endX, endY] = swipeCoords[direction];

  console.log(`Swiping ${direction}`);
  runAdbCommand([
    "shell", "input", "swipe",
    String(startX), String(startY),
    String(endX), String(endY),
    SWIPE_DURATION_MS,
  ]);
  return { success: true, message: `Swiped ${direction}` };
}
|
||||
|
||||
/** Sends the Home key event to the device and reports success. */
function executeHome(): ActionResult {
  const keyeventArgs = ["shell", "input", "keyevent", KEYCODE_HOME];
  console.log("Going Home");
  runAdbCommand(keyeventArgs);
  return { success: true, message: "Went to home screen" };
}
|
||||
|
||||
/** Sends the Back key event to the device and reports success. */
function executeBack(): ActionResult {
  const keyeventArgs = ["shell", "input", "keyevent", KEYCODE_BACK];
  console.log("Going Back");
  runAdbCommand(keyeventArgs);
  return { success: true, message: "Went back" };
}
|
||||
|
||||
/** Blocks synchronously for two seconds, then reports success. */
function executeWait(): ActionResult {
  const pauseMs = 2000;
  console.log("Waiting...");
  Bun.sleepSync(pauseMs);
  return { success: true, message: "Waited 2s" };
}
|
||||
|
||||
/**
 * Logs the completion reason ("Task complete" when none is given) and
 * returns a success result whose message is the literal "done".
 */
function executeDone(action: ActionDecision): ActionResult {
  const summary = action.reason ?? "Task complete";
  console.log(`Goal Achieved: ${summary}`);
  return { success: true, message: "done" };
}
|
||||
|
||||
// ===========================================
|
||||
// New actions
|
||||
// ===========================================
|
||||
|
||||
/**
 * Long-presses at `action.coordinates`.
 *
 * Implemented as an `input swipe` whose start and end points are the same,
 * held for `LONG_PRESS_DURATION_MS`.
 *
 * @returns A failed result (and no ADB call) when the coordinates are
 *   missing or are not two finite numbers; otherwise a success result.
 */
function executeLongPress(action: ActionDecision): ActionResult {
  const coords = action.coordinates;
  // Refuse malformed LLM output instead of long-pressing at (0, 0).
  if (
    !Array.isArray(coords) ||
    coords.length < 2 ||
    !Number.isFinite(coords[0]) ||
    !Number.isFinite(coords[1])
  ) {
    return { success: false, message: "No valid coordinates to long press" };
  }

  const [x, y] = coords;
  console.log(`Long pressing: (${x}, ${y})`);
  runAdbCommand([
    "shell", "input", "swipe",
    String(x), String(y), String(x), String(y),
    LONG_PRESS_DURATION_MS,
  ]);
  return { success: true, message: `Long pressed (${x}, ${y})` };
}
|
||||
|
||||
/**
 * Takes a screenshot on the device and pulls it to the local machine.
 *
 * The image is written on the device at `DEVICE_SCREENSHOT_PATH`, then
 * pulled to `action.filename` (default `LOCAL_SCREENSHOT_PATH`). The local
 * path is returned in `data`. Success is reported without checking the
 * outcome of either ADB command.
 */
function executeScreenshot(action: ActionDecision): ActionResult {
  const localPath = action.filename ?? LOCAL_SCREENSHOT_PATH;
  console.log(`Taking screenshot → ${localPath}`);

  const adbSteps: string[][] = [
    ["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH],
    ["pull", DEVICE_SCREENSHOT_PATH, localPath],
  ];
  for (const step of adbSteps) {
    runAdbCommand(step);
  }

  return { success: true, message: `Screenshot saved to ${localPath}`, data: localPath };
}
|
||||
|
||||
/**
 * Launches an app by package name, explicit activity, or URI intent.
 *
 * Examples the LLM can produce:
 *   { action: "launch", package: "com.whatsapp" }
 *   { action: "launch", package: "com.whatsapp", activity: ".HomeActivity" }
 *   { action: "launch", uri: "https://maps.google.com/?q=pizza+near+me" }
 *   { action: "launch", package: "com.whatsapp", uri: "content://media/external/images/1",
 *     extras: { "android.intent.extra.TEXT": "Check this out" } }
 *
 * Behavior:
 *   - package only (no activity, no uri): starts the default launcher
 *     activity through `monkey`; extras are not used on this path.
 *   - otherwise: builds an `am start` intent. A uri becomes a VIEW intent;
 *     package + activity select an explicit component (`-n`); package with a
 *     uri but no activity restricts resolution to that package (`-p`).
 *
 * The ADB output is returned in `data`. Success is reported without
 * inspecting that output.
 */
function executeLaunch(action: ActionDecision): ActionResult {
  // adb joins its arguments with spaces and hands the line to the device
  // shell, so values containing spaces, "&", "$" etc. must be single-quoted
  // to reach `am` as one argument.
  const quote = (value: string): string => `'${String(value).replace(/'/g, "'\\''")}'`;

  if (action.package && !action.activity && !action.uri) {
    // Launch the default activity for the package
    const launchResult = runAdbCommand([
      "shell", "monkey", "-p", action.package, "-c",
      "android.intent.category.LAUNCHER", "1",
    ]);
    console.log(`Launching: ${action.package}`);
    return { success: true, message: `Launched ${action.package}`, data: launchResult };
  }

  const args: string[] = ["shell", "am", "start"];

  if (action.uri) {
    args.push("-a", "android.intent.action.VIEW");
    args.push("-d", quote(action.uri));
  }

  if (action.package && action.activity) {
    args.push("-n", quote(`${action.package}/${action.activity}`));
  } else if (action.package) {
    // uri + package without activity: keep the uri and extras, and limit
    // intent resolution to the requested package.
    args.push("-p", quote(action.package));
  }

  // Attach intent extras
  if (action.extras) {
    for (const [key, value] of Object.entries(action.extras)) {
      args.push("--es", quote(key), quote(value));
    }
  }

  const label = action.package ?? action.uri ?? "intent";
  console.log(`Launching: ${label}`);
  const result = runAdbCommand(args);
  return { success: true, message: `Launched ${label}`, data: result };
}
|
||||
|
||||
/**
 * Attempts to empty the focused text field by sending three key events in
 * order: MOVE_END, a long-press MOVE_HOME, then DEL.
 *
 * NOTE(review): the long-press MOVE_HOME step is meant to select the text
 * back to the start of the field — confirm it actually selects on the
 * target devices.
 */
function executeClear(): ActionResult {
  console.log("Clearing text field");

  const keySequence: string[][] = [
    [KEYCODE_MOVE_END],                 // cursor to end of field
    ["--longpress", KEYCODE_MOVE_HOME], // intended "select to start"
    [KEYCODE_DEL],                      // delete
  ];
  for (const keyArgs of keySequence) {
    runAdbCommand(["shell", "input", "keyevent", ...keyArgs]);
  }

  return { success: true, message: "Cleared text field" };
}
|
||||
|
||||
/**
 * Reads the device clipboard.
 *
 * First tries `cmd clipboard get-text`. When that prints nothing, falls
 * back to `service call clipboard 2 i32 1` and returns its raw output
 * without decoding it. Both paths report success, even when the fallback
 * output is empty.
 */
function executeClipboardGet(): ActionResult {
  console.log("Reading clipboard");

  const primary = runAdbCommand(["shell", "cmd", "clipboard", "get-text"]);
  if (!primary) {
    // Fallback for older Android versions
    const raw = runAdbCommand([
      "shell", "service", "call", "clipboard", "2", "i32", "1",
    ]);
    return { success: true, message: `Clipboard (raw): ${raw}`, data: raw };
  }

  console.log(`Clipboard: ${primary.slice(0, 100)}`);
  return { success: true, message: `Clipboard: ${primary}`, data: primary };
}
|
||||
|
||||
/**
 * Sets the device clipboard to `action.text` via `cmd clipboard set-text`.
 *
 * The text is single-quoted for the device shell so that spaces stay in
 * one argument and metacharacters (`;`, `$`, backticks, ...) are not
 * interpreted.
 *
 * @returns A failed result when there is no text; otherwise success.
 */
function executeClipboardSet(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to set on clipboard" };

  // adb joins arguments with spaces and runs the line through the device
  // shell; close/escape/reopen the quote around any embedded single quote.
  const quotedText = `'${text.replace(/'/g, "'\\''")}'`;

  console.log(`Setting clipboard: ${text.slice(0, 50)}...`);
  runAdbCommand(["shell", "cmd", "clipboard", "set-text", quotedText]);
  return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` };
}
|
||||
|
||||
/**
 * Runs `action.command` through `adb shell`, splitting it on single spaces.
 *
 * The full output is returned in `data`; the message carries only its
 * first 200 characters.
 *
 * NOTE(security): the command string comes from the LLM decision and is
 * executed on the device as-is. Use sparingly for edge cases.
 *
 * @returns A failed result when no command is given; otherwise success.
 */
function executeShell(action: ActionDecision): ActionResult {
  const commandLine = action.command ?? "";
  if (!commandLine) {
    return { success: false, message: "No command provided" };
  }

  console.log(`Shell: ${commandLine}`);
  const adbArgs = ["shell"].concat(commandLine.split(" "));
  const output = runAdbCommand(adbArgs);
  const preview = output.slice(0, 200);
  return { success: true, message: `Shell output: ${preview}`, data: output };
}
|
||||
99
src/config.ts
Normal file
99
src/config.ts
Normal file
@@ -0,0 +1,99 @@
|
||||
/**
|
||||
* Configuration management for Android Action Kernel.
|
||||
* Bun natively loads .env files — no dotenv needed.
|
||||
*/
|
||||
|
||||
import {
|
||||
DEVICE_DUMP_PATH,
|
||||
LOCAL_DUMP_PATH,
|
||||
DEVICE_SCREENSHOT_PATH,
|
||||
LOCAL_SCREENSHOT_PATH,
|
||||
DEFAULT_MAX_STEPS,
|
||||
DEFAULT_STEP_DELAY,
|
||||
DEFAULT_GROQ_MODEL,
|
||||
DEFAULT_OPENAI_MODEL,
|
||||
DEFAULT_BEDROCK_MODEL,
|
||||
DEFAULT_MAX_RETRIES,
|
||||
DEFAULT_STUCK_THRESHOLD,
|
||||
DEFAULT_MAX_ELEMENTS,
|
||||
DEFAULT_LOG_DIR,
|
||||
DEFAULT_VISION_MODE,
|
||||
DEFAULT_MAX_HISTORY_STEPS,
|
||||
DEFAULT_STREAMING_ENABLED,
|
||||
type VisionMode,
|
||||
} from "./constants.js";
|
||||
|
||||
/**
 * Reads an environment variable.
 *
 * @param key Variable name.
 * @param fallback Value returned when the variable is not set (default "").
 *   A variable set to the empty string is returned as "".
 */
function env(key: string, fallback = ""): string {
  const value = process.env[key];
  return value != null ? value : fallback;
}
|
||||
|
||||
/**
 * Reads an integer environment variable, returning `fallback` when it is
 * unset, empty, or does not parse to a finite number.
 */
function envInt(key: string, fallback: number): number {
  const parsed = parseInt(env(key, String(fallback)), 10);
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Reads a floating-point environment variable, returning `fallback` when it
 * is unset, empty, or does not parse to a finite number.
 */
function envFloat(key: string, fallback: number): number {
  const parsed = parseFloat(env(key, String(fallback)));
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Runtime configuration, resolved once at module load from environment
 * variables with defaults from `constants`.
 *
 * Numeric settings fall back to their defaults when the environment value
 * is not a valid number, so a typo such as `MAX_STEPS=abc` cannot turn
 * into NaN (which would make the agent loop run zero steps and ADB
 * commands never execute).
 */
export const Config = {
  // ADB Configuration
  ADB_PATH: env("ADB_PATH", "adb"),
  SCREEN_DUMP_PATH: DEVICE_DUMP_PATH,
  LOCAL_DUMP_PATH: LOCAL_DUMP_PATH,
  DEVICE_SCREENSHOT_PATH: DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH: LOCAL_SCREENSHOT_PATH,

  // Agent Configuration
  MAX_STEPS: envInt("MAX_STEPS", DEFAULT_MAX_STEPS),
  STEP_DELAY: envFloat("STEP_DELAY", DEFAULT_STEP_DELAY),
  MAX_RETRIES: envInt("MAX_RETRIES", DEFAULT_MAX_RETRIES),
  STUCK_THRESHOLD: envInt("STUCK_THRESHOLD", DEFAULT_STUCK_THRESHOLD),

  // Vision mode: "off" | "fallback" (only when tree empty) | "always" (every step)
  // NOTE: the env value is cast, not validated; any other string passes through.
  VISION_MODE: (env("VISION_MODE", DEFAULT_VISION_MODE) as VisionMode),

  // Smart element filtering
  MAX_ELEMENTS: envInt("MAX_ELEMENTS", DEFAULT_MAX_ELEMENTS),

  // Session logging
  LOG_DIR: env("LOG_DIR", DEFAULT_LOG_DIR),

  // Multi-turn memory
  MAX_HISTORY_STEPS: envInt("MAX_HISTORY_STEPS", DEFAULT_MAX_HISTORY_STEPS),

  // Streaming responses: enabled only when the value is exactly "true"
  STREAMING_ENABLED: env("STREAMING_ENABLED", String(DEFAULT_STREAMING_ENABLED)) === "true",

  // LLM Provider: "groq", "openai", "bedrock", or "openrouter"
  LLM_PROVIDER: env("LLM_PROVIDER", "groq"),

  // Groq Configuration
  GROQ_API_KEY: env("GROQ_API_KEY"),
  GROQ_MODEL: env("GROQ_MODEL", DEFAULT_GROQ_MODEL),

  // OpenAI Configuration
  OPENAI_API_KEY: env("OPENAI_API_KEY"),
  OPENAI_MODEL: env("OPENAI_MODEL", DEFAULT_OPENAI_MODEL),

  // AWS Bedrock Configuration
  AWS_REGION: env("AWS_REGION", "us-east-1"),
  BEDROCK_MODEL: env("BEDROCK_MODEL", DEFAULT_BEDROCK_MODEL),

  // OpenRouter Configuration (via Vercel AI SDK)
  OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
  OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),

  /**
   * Returns the model name for the configured provider. Any provider other
   * than "groq", "bedrock" or "openrouter" gets the OpenAI model.
   */
  getModel(): string {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq") return Config.GROQ_MODEL;
    if (provider === "bedrock") return Config.BEDROCK_MODEL;
    if (provider === "openrouter") return Config.OPENROUTER_MODEL;
    return Config.OPENAI_MODEL;
  },

  /**
   * Checks that the API key required by the configured provider is set.
   *
   * @throws Error when the key for "groq", "openai" or "openrouter" is empty.
   */
  validate(): void {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq" && !Config.GROQ_API_KEY) {
      throw new Error("GROQ_API_KEY is required when using Groq provider");
    }
    if (provider === "openai" && !Config.OPENAI_API_KEY) {
      throw new Error("OPENAI_API_KEY is required when using OpenAI provider");
    }
    if (provider === "openrouter" && !Config.OPENROUTER_API_KEY) {
      throw new Error("OPENROUTER_API_KEY is required when using OpenRouter provider");
    }
    // Bedrock uses AWS credential chain, no explicit validation needed
  },
};
|
||||
118
src/constants.ts
Normal file
118
src/constants.ts
Normal file
@@ -0,0 +1,118 @@
|
||||
/**
|
||||
* Constants for Android Action Kernel.
|
||||
* All magic strings, URLs, and fixed values in one place.
|
||||
*/
|
||||
|
||||
// ===========================================
// API Endpoints
// ===========================================

/** Base URL used for Groq API requests. */
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";

// ===========================================
// ADB Key Codes
// ===========================================
// Values are strings because they are passed straight through as arguments
// to `adb shell input keyevent`. HOME and BACK use the symbolic key name;
// the rest use numeric codes.

// Text entry and editing
export const KEYCODE_ENTER = "66";
export const KEYCODE_DEL = "67";
export const KEYCODE_FORWARD_DEL = "112";
export const KEYCODE_MOVE_HOME = "122";
export const KEYCODE_MOVE_END = "123";
export const KEYCODE_TAB = "61";
export const KEYCODE_ESCAPE = "111";

// System navigation
export const KEYCODE_HOME = "KEYCODE_HOME";
export const KEYCODE_BACK = "KEYCODE_BACK";
export const KEYCODE_MENU = "82";

// Directional pad
export const KEYCODE_DPAD_UP = "19";
export const KEYCODE_DPAD_DOWN = "20";
export const KEYCODE_DPAD_LEFT = "21";
export const KEYCODE_DPAD_RIGHT = "22";

// Hardware buttons
export const KEYCODE_VOLUME_UP = "24";
export const KEYCODE_VOLUME_DOWN = "25";
export const KEYCODE_POWER = "26";
|
||||
|
||||
// ===========================================
// Default Screen Coordinates (for swipe actions)
// Fallback values sized for a 1080x2400 screen; used only when no
// device-specific coordinates have been computed.
// ===========================================

/** Horizontal centre of the 1080-pixel-wide reference screen. */
export const SCREEN_CENTER_X = 540;
/** Vertical centre of the 2400-pixel-tall reference screen. */
export const SCREEN_CENTER_Y = 1200;

/**
 * Fallback swipe gestures keyed by direction.
 * Each entry is [start_x, start_y, end_x, end_y].
 */
export const SWIPE_COORDS: Record<string, [number, number, number, number]> = {
  // vertical gestures run along the centre column
  up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500],
  down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500],
  // horizontal gestures run along the centre row
  left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y],
  right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y],
};
|
||||
|
||||
/**
 * Builds a swipe-coordinate table for a screen of the given size.
 *
 * Positions are fixed fractions of the screen, floored to whole pixels:
 * vertical swipes run between 62.5% and 20.8% of the height along the
 * centre column; horizontal swipes run between 74.1% and 18.5% of the
 * width along the centre row.
 *
 * @param width Screen width in pixels.
 * @param height Screen height in pixels.
 * @returns Map of direction to [start_x, start_y, end_x, end_y].
 */
export function computeSwipeCoords(
  width: number,
  height: number
): Record<string, [number, number, number, number]> {
  const fractionOf = (extent: number, ratio: number): number => Math.floor(extent * ratio);

  const midX = Math.floor(width / 2);
  const midY = Math.floor(height / 2);

  const upperY = fractionOf(height, 0.208);
  const lowerY = fractionOf(height, 0.625);
  const nearX = fractionOf(width, 0.185);
  const farX = fractionOf(width, 0.741);

  const up: [number, number, number, number] = [midX, lowerY, midX, upperY];
  const down: [number, number, number, number] = [midX, upperY, midX, lowerY];
  const left: [number, number, number, number] = [farX, midY, nearX, midY];
  const right: [number, number, number, number] = [nearX, midY, farX, midY];

  return { up, down, left, right };
}
|
||||
// Gesture durations, kept as strings because they are passed as the final
// argument of `adb shell input swipe`.
export const SWIPE_DURATION_MS = "300";
export const LONG_PRESS_DURATION_MS = "1000";

// ===========================================
// Default Models
// ===========================================
export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile";
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";

// ===========================================
// Bedrock Model Identifiers
// NOTE(review): presumably matched against Bedrock model ids to pick a
// request format — confirm against the provider code.
// ===========================================
export const BEDROCK_ANTHROPIC_MODELS = ["anthropic"];
export const BEDROCK_META_MODELS = ["meta", "llama"];

// ===========================================
// File Paths
// Device paths live on the Android device; local paths are relative to
// the working directory of the process.
// ===========================================
export const DEVICE_DUMP_PATH = "/sdcard/window_dump.xml";
export const LOCAL_DUMP_PATH = "window_dump.xml";
export const DEVICE_SCREENSHOT_PATH = "/sdcard/kernel_screenshot.png";
export const LOCAL_SCREENSHOT_PATH = "kernel_screenshot.png";

// ===========================================
// Agent Defaults
// ===========================================
export const DEFAULT_MAX_STEPS = 30;
export const DEFAULT_STEP_DELAY = 2.0;
export const DEFAULT_MAX_RETRIES = 3;
export const DEFAULT_STUCK_THRESHOLD = 3;
export const DEFAULT_VISION_ENABLED = true;

// Phase 2: Context Quality
export const DEFAULT_MAX_ELEMENTS = 40;
export const DEFAULT_LOG_DIR = "logs";

// Phase 3: Vision Mode
/** When screenshots are captured: never, only as a fallback, or always. */
export type VisionMode = "off" | "fallback" | "always";
export const DEFAULT_VISION_MODE: VisionMode = "fallback";

// Phase 4: Multi-turn Memory
export const DEFAULT_MAX_HISTORY_STEPS = 10;

// Phase 5: Streaming
export const DEFAULT_STREAMING_ENABLED = true;
|
||||
416
src/kernel.ts
Normal file
416
src/kernel.ts
Normal file
@@ -0,0 +1,416 @@
|
||||
/**
|
||||
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
|
||||
*
|
||||
* An AI agent that controls Android devices through the accessibility API.
|
||||
* Uses LLMs to make decisions based on screen context.
|
||||
*
|
||||
* Features:
|
||||
* - Perception -> Reasoning -> Action loop
|
||||
* - Screen state diffing (stuck loop detection)
|
||||
* - Error recovery with retries
|
||||
* - Vision fallback & always-on multimodal screenshots
|
||||
* - Dynamic early exit on goal completion
|
||||
* - Smart element filtering (compact JSON, top-N scoring)
|
||||
* - Multi-turn conversation memory
|
||||
* - Multi-step planning (think/plan/planProgress)
|
||||
* - Streaming LLM responses
|
||||
* - Session logging with crash-safe partial writes
|
||||
* - Auto-detect screen resolution & foreground app
|
||||
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
|
||||
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
|
||||
*
|
||||
* Usage:
|
||||
* bun run src/kernel.ts
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync, rmSync } from "fs";
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
executeAction,
|
||||
runAdbCommand,
|
||||
getScreenResolution,
|
||||
getForegroundApp,
|
||||
initDeviceContext,
|
||||
type ActionDecision,
|
||||
type ActionResult,
|
||||
} from "./actions.js";
|
||||
import {
|
||||
getLlmProvider,
|
||||
trimMessages,
|
||||
SYSTEM_PROMPT,
|
||||
type LLMProvider,
|
||||
type ChatMessage,
|
||||
type ContentPart,
|
||||
} from "./llm-providers.js";
|
||||
import {
|
||||
getInteractiveElements,
|
||||
computeScreenHash,
|
||||
filterElements,
|
||||
type UIElement,
|
||||
} from "./sanitizer.js";
|
||||
import {
|
||||
DEVICE_SCREENSHOT_PATH,
|
||||
LOCAL_SCREENSHOT_PATH,
|
||||
} from "./constants.js";
|
||||
import { SessionLogger } from "./logger.js";
|
||||
|
||||
// ===========================================
|
||||
// Screen Perception
|
||||
// ===========================================
|
||||
|
||||
/** Snapshot of the current screen as produced by `getScreenState()`. */
interface ScreenState {
  /** Every element parsed from the UI dump (unfiltered). */
  elements: UIElement[];
  /**
   * JSON string of the filtered element list that is sent to the LLM, or
   * an "Error: ..." message when the screen could not be captured.
   */
  compactJson: string;
}
|
||||
|
||||
/**
 * Dumps the current UI hierarchy over ADB and returns the parsed elements
 * together with the compact, filtered JSON handed to the LLM.
 *
 * The local dump file from a previous step is deleted before capturing, so
 * a failed dump or pull is reported as a capture error instead of silently
 * re-reading the old screen. Any exception while capturing or reading the
 * file is also reported as a capture error.
 *
 * @returns The screen state; on failure `elements` is empty and
 *   `compactJson` is "Error: Could not capture screen.".
 */
function getScreenState(): ScreenState {
  const captureFailed = (): ScreenState => ({
    elements: [],
    compactJson: "Error: Could not capture screen.",
  });

  let xmlContent: string;
  try {
    // `force: true` makes this a no-op when the file does not exist.
    rmSync(Config.LOCAL_DUMP_PATH, { force: true });
    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);

    if (!existsSync(Config.LOCAL_DUMP_PATH)) {
      return captureFailed();
    }
    xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
  } catch {
    console.log("Warning: ADB screen capture failed.");
    return captureFailed();
  }

  const elements = getInteractiveElements(xmlContent);
  const compact = filterElements(elements, Config.MAX_ELEMENTS);
  return { elements, compactJson: JSON.stringify(compact) };
}
|
||||
|
||||
/**
 * Captures a screenshot over ADB and returns it as a base64-encoded string.
 *
 * The local screenshot from a previous capture is deleted first, so a
 * failed capture yields `null` rather than an outdated image.
 *
 * @returns Base64 of the pulled PNG file, or `null` when the file was not
 *   produced or any step threw.
 */
function captureScreenshotBase64(): string | null {
  try {
    // `force: true` makes this a no-op when the file does not exist.
    rmSync(LOCAL_SCREENSHOT_PATH, { force: true });
    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);

    if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      // readFileSync without an encoding already returns a Buffer.
      return readFileSync(LOCAL_SCREENSHOT_PATH).toString("base64");
    }
  } catch {
    console.log("Warning: Screenshot capture failed.");
  }
  return null;
}
|
||||
|
||||
// ===========================================
|
||||
// Screen State Diffing
|
||||
// ===========================================
|
||||
|
||||
/** Result of comparing two consecutive screen captures. */
interface ScreenDiff {
  /** True when the screen hashes of the two captures differ. */
  changed: boolean;
  /** Element texts present now but not in the previous capture. */
  addedTexts: string[];
  /** Element texts present previously but not now. */
  removedTexts: string[];
  /** One-line description of the change, written for the LLM prompt. */
  summary: string;
}
|
||||
|
||||
/**
 * Compares two element lists and describes what changed between them.
 *
 * `changed` is decided by comparing `computeScreenHash` of both lists.
 * The added/removed text lists are computed from the distinct non-empty
 * element texts regardless of `changed`. The summary names at most five
 * added and five removed texts.
 */
function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const distinctTexts = (list: UIElement[]) =>
    new Set(list.map((el) => el.text).filter(Boolean));

  const before = distinctTexts(prevElements);
  const after = distinctTexts(currElements);

  const addedTexts = Array.from(after).filter((text) => !before.has(text));
  const removedTexts = Array.from(before).filter((text) => !after.has(text));

  const changed = computeScreenHash(prevElements) !== computeScreenHash(currElements);
  if (!changed) {
    return {
      changed,
      addedTexts,
      removedTexts,
      summary: "Screen has NOT changed since last action.",
    };
  }

  const notes: string[] = [];
  if (addedTexts.length > 0) {
    notes.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
  }
  if (removedTexts.length > 0) {
    notes.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
  }

  // The hash can differ even when no text was added or removed.
  const summary = notes.join(". ") || "Screen layout changed.";
  return { changed, addedTexts, removedTexts, summary };
}
|
||||
|
||||
// ===========================================
|
||||
// Streaming LLM Consumer
|
||||
// ===========================================
|
||||
|
||||
/**
 * Asks the LLM for the next action, streaming the response when possible.
 *
 * Streaming is used only when it is enabled in `Config`, the provider
 * advertises support, and the provider implements `getDecisionStream`;
 * otherwise this delegates to `llm.getDecision`. While streaming, one dot
 * is printed per received chunk, and the accumulated text is parsed with
 * `parseJsonResponse`.
 */
async function getDecisionStreaming(
  llm: LLMProvider,
  messages: ChatMessage[]
): Promise<ActionDecision> {
  if (Config.STREAMING_ENABLED && llm.capabilities.supportsStreaming && llm.getDecisionStream) {
    let responseText = "";
    process.stdout.write("Thinking");
    for await (const piece of llm.getDecisionStream(messages)) {
      responseText += piece;
      process.stdout.write(".");
    }
    process.stdout.write("\n");

    return parseJsonResponse(responseText);
  }

  return llm.getDecision(messages);
}
|
||||
|
||||
/**
 * Parses a streamed LLM response into an action decision.
 *
 * Candidates are tried in order until one parses to a JSON object:
 *   1. the whole text;
 *   2. the span from the first "{" to the last "}" — this handles
 *      markdown fences or prose wrapped around a single (possibly nested)
 *      object;
 *   3. the shortest "{...}" span — this handles text containing several
 *      flat objects, taking the first.
 *
 * Values that parse but are not plain objects (null, numbers, strings,
 * arrays) are rejected. When nothing usable is found, a warning is logged
 * and a "wait" decision is returned so the agent loop can continue.
 *
 * (Duplicated from llm-providers for the streaming path.)
 */
function parseJsonResponse(text: string): ActionDecision {
  const candidates: string[] = [text];

  const firstBrace = text.indexOf("{");
  const lastBrace = text.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(text.slice(firstBrace, lastBrace + 1));
  }

  const shortest = text.match(/\{[\s\S]*?\}/);
  if (shortest) {
    candidates.push(shortest[0]);
  }

  for (const candidate of candidates) {
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        return parsed as ActionDecision;
      }
    } catch {
      // not valid JSON; try the next candidate
    }
  }

  console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`);
  return { action: "wait", reason: "Failed to parse response, waiting" };
}
|
||||
|
||||
// ===========================================
|
||||
// Main Agent Loop
|
||||
// ===========================================
|
||||
|
||||
/**
 * Runs the perceive -> reason -> act loop for a single goal.
 *
 * Each step captures the screen state, diffs it against the previous step to
 * detect stuck loops, optionally attaches a screenshot, asks the LLM for the
 * next action, executes it, and records the step in the session log. The loop
 * ends when the LLM answers with the "done" action or the step budget runs out.
 *
 * LLM and action failures are caught per step and turned into a "wait"
 * decision / failed result so that a single bad step does not abort the run.
 *
 * @param goal - Natural-language task; sent to the LLM on every step.
 * @param maxSteps - Step budget; defaults to `Config.MAX_STEPS`.
 */
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
  const steps = maxSteps ?? Config.MAX_STEPS;

  // Anything can be thrown in JS; only Error instances carry a usable message.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  // Phase 1A: Auto-detect screen resolution
  const resolution = getScreenResolution();
  if (resolution) {
    initDeviceContext(resolution);
    console.log(`Screen resolution: ${resolution[0]}x${resolution[1]}`);
  } else {
    console.log("Screen resolution: using default 1080x2400 swipe coords");
  }

  console.log("Android Action Kernel Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision: ${Config.VISION_MODE} | Streaming: ${Config.STREAMING_ENABLED}`);
  console.log(`Max elements: ${Config.MAX_ELEMENTS} | History: ${Config.MAX_HISTORY_STEPS} steps`);

  const llm = getLlmProvider();

  // Phase 2B: Session logging
  const logger = new SessionLogger(
    Config.LOG_DIR,
    goal,
    Config.LLM_PROVIDER,
    Config.getModel()
  );

  // Phase 4A: Multi-turn conversation memory
  const messages: ChatMessage[] = [
    { role: "system", content: SYSTEM_PROMPT },
  ];

  let prevElements: UIElement[] = [];
  let stuckCount = 0;

  for (let step = 0; step < steps; step++) {
    console.log(`\n--- Step ${step + 1}/${steps} ---`);

    // 1. Perception: Capture screen state
    console.log("Scanning screen...");
    const { elements, compactJson: screenContext } = getScreenState();

    // 1B. Foreground app detection
    const foregroundApp = getForegroundApp();
    if (foregroundApp) {
      console.log(`Foreground: ${foregroundApp}`);
    }

    // 2. Screen diff: detect stuck loops (nothing to compare against on the first step)
    let diffContext = "";
    let screenChanged = true;
    if (step > 0) {
      const diff = diffScreenState(prevElements, elements);
      screenChanged = diff.changed;
      diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;

      if (!diff.changed) {
        stuckCount++;
        console.log(
          `Warning: Screen unchanged for ${stuckCount} step(s).`
        );
        if (stuckCount >= Config.STUCK_THRESHOLD) {
          console.log(
            `Stuck for ${stuckCount} steps. Injecting recovery hint.`
          );
          diffContext +=
            `\nWARNING: You have been stuck for ${stuckCount} steps. ` +
            `The screen is NOT changing. Try a DIFFERENT action: ` +
            `swipe to scroll, press back, go home, or launch a different app.` +
            `\nYour plan is not working. Create a NEW plan with a different approach.`;
        }
      } else {
        stuckCount = 0;
      }
    }
    prevElements = elements;

    // 3. Vision: capture screenshot based on VISION_MODE
    let screenshotBase64: string | null = null;
    let visionContext = "";

    const shouldCaptureVision =
      Config.VISION_MODE === "always" ||
      (Config.VISION_MODE === "fallback" && elements.length === 0);

    if (shouldCaptureVision) {
      screenshotBase64 = captureScreenshotBase64();
      if (elements.length === 0) {
        visionContext =
          "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
          "A screenshot has been captured. The screen likely contains custom-drawn " +
          "content (game, WebView, or Flutter). Try using coordinate-based taps on " +
          "common UI positions, or use 'back'/'home' to navigate away.";
      }
      if (screenshotBase64 && llm.capabilities.supportsImages) {
        console.log("Sending screenshot to LLM");
      }
    }

    // 4. Build user message with all context
    const foregroundLine = foregroundApp
      ? `FOREGROUND_APP: ${foregroundApp}\n\n`
      : "";
    const textContent =
      `GOAL: ${goal}\n\n${foregroundLine}SCREEN_CONTEXT:\n${screenContext}${diffContext}${visionContext}`;

    // Build content parts (text + optional image)
    const userContent: ContentPart[] = [{ type: "text", text: textContent }];
    if (screenshotBase64 && llm.capabilities.supportsImages) {
      userContent.push({
        type: "image",
        base64: screenshotBase64,
        mimeType: "image/png",
      });
    }

    messages.push({ role: "user", content: userContent });

    // Trim messages to keep within history limit
    const trimmed = trimMessages(messages, Config.MAX_HISTORY_STEPS);

    // 5. Reasoning: Get LLM decision
    const llmStart = performance.now();
    let decision: ActionDecision;
    try {
      decision = await getDecisionStreaming(llm, trimmed);
    } catch (err) {
      console.log(`LLM Error: ${describeError(err)}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting" };
    }
    const llmLatency = performance.now() - llmStart;

    // Log thinking and planning
    if (decision.think) {
      console.log(`Think: ${decision.think}`);
    }
    // The decision is unvalidated LLM JSON: `plan` may arrive as a plain string
    // (or anything else), so never call `.join` on it unchecked.
    const plan: unknown = decision.plan;
    if (Array.isArray(plan)) {
      console.log(`Plan: ${plan.join(" -> ")}`);
    } else if (plan) {
      console.log(`Plan: ${String(plan)}`);
    }
    if (decision.planProgress) {
      console.log(`Progress: ${decision.planProgress}`);
    }
    console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"} (${Math.round(llmLatency)}ms)`);

    // Append assistant response to conversation
    messages.push({
      role: "assistant",
      content: JSON.stringify(decision),
    });

    // 6. Action: Execute the decision
    const actionStart = performance.now();
    let result: ActionResult;
    try {
      result = executeAction(decision);
    } catch (err) {
      const message = describeError(err);
      console.log(`Action Error: ${message}`);
      result = { success: false, message };
    }
    const actionLatency = performance.now() - actionStart;

    // Log step
    logger.logStep(
      step + 1,
      foregroundApp,
      elements.length,
      screenChanged,
      decision,
      result,
      Math.round(llmLatency),
      Math.round(actionLatency)
    );

    console.log(`Messages in context: ${trimmed.length}`);

    // 7. Check for goal completion
    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      logger.finalize(true);
      return;
    }

    // Wait for UI to update (STEP_DELAY is in seconds)
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }

  console.log("\nMax steps reached. Task may be incomplete.");
  logger.finalize(false);
}
|
||||
|
||||
// ===========================================
|
||||
// Entry Point
|
||||
// ===========================================
|
||||
|
||||
/**
 * CLI entry point: validates configuration, reads the goal from stdin and
 * hands it to `runAgent`.
 *
 * Only the first chunk delivered by stdin is used as the goal (same as
 * before). A failed read now rejects this function instead of leaving the
 * goal promise pending forever, and the reader lock is always released.
 */
async function main(): Promise<void> {
  try {
    Config.validate();
  } catch (e) {
    console.log(`Configuration Error: ${e instanceof Error ? e.message : String(e)}`);
    return;
  }

  // Read user input from stdin
  process.stdout.write("Enter your goal: ");
  const reader = Bun.stdin.stream().getReader();
  let goal: string;
  try {
    // `value` is undefined when stdin is already closed; decode() then yields "".
    const { value } = await reader.read();
    goal = new TextDecoder().decode(value).trim();
  } finally {
    reader.releaseLock();
  }

  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }

  await runAgent(goal);
}
|
||||
|
||||
// Never leave the top-level promise floating: report the failure and signal
// it through the exit code instead of relying on unhandled-rejection behavior.
main().catch((err: unknown) => {
  console.error("Fatal error:", err);
  process.exitCode = 1;
});
|
||||
535
src/llm-providers.ts
Normal file
535
src/llm-providers.ts
Normal file
@@ -0,0 +1,535 @@
|
||||
/**
|
||||
* LLM Provider module for Android Action Kernel.
|
||||
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
|
||||
*
|
||||
* Phase 3: Real multimodal vision (image content parts)
|
||||
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
|
||||
* Phase 5: Streaming responses (getDecisionStream)
|
||||
*/
|
||||
|
||||
import OpenAI from "openai";
|
||||
import {
|
||||
BedrockRuntimeClient,
|
||||
InvokeModelCommand,
|
||||
InvokeModelWithResponseStreamCommand,
|
||||
} from "@aws-sdk/client-bedrock-runtime";
|
||||
import { generateText, streamText } from "ai";
|
||||
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
GROQ_API_BASE_URL,
|
||||
BEDROCK_ANTHROPIC_MODELS,
|
||||
BEDROCK_META_MODELS,
|
||||
} from "./constants.js";
|
||||
import type { ActionDecision } from "./actions.js";
|
||||
|
||||
// ===========================================
|
||||
// System Prompt — all 15 actions + planning
|
||||
// ===========================================
|
||||
|
||||
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
|
||||
|
||||
You will receive:
|
||||
1. GOAL — the user's task.
|
||||
2. FOREGROUND_APP — the currently active app package and activity.
|
||||
3. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates and states.
|
||||
4. SCREENSHOT — an image of the current screen (when available).
|
||||
5. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
|
||||
6. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
|
||||
|
||||
Previous conversation turns contain your earlier observations and actions (multi-turn memory).
|
||||
|
||||
You must output ONLY a valid JSON object with your next action.
|
||||
|
||||
═══════════════════════════════════════════
|
||||
THINKING & PLANNING
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Before each action, include a "think" field with your reasoning about the current state and what to do next.
|
||||
|
||||
Optionally include:
|
||||
- "plan": an array of 3-5 high-level steps to achieve the goal
|
||||
- "planProgress": a brief note on which plan step you're currently on
|
||||
|
||||
Example:
|
||||
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
|
||||
|
||||
═══════════════════════════════════════════
|
||||
AVAILABLE ACTIONS (15 total)
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Navigation:
|
||||
{"action": "tap", "coordinates": [x, y], "reason": "..."}
|
||||
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
|
||||
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
|
||||
{"action": "enter", "reason": "Press Enter/submit"}
|
||||
{"action": "back", "reason": "Navigate back"}
|
||||
{"action": "home", "reason": "Go to home screen"}
|
||||
|
||||
Text Input:
|
||||
{"action": "type", "text": "Hello World", "reason": "..."}
|
||||
{"action": "clear", "reason": "Clear current text field before typing"}
|
||||
|
||||
App Control:
|
||||
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
|
||||
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
|
||||
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
|
||||
|
||||
Data:
|
||||
{"action": "screenshot", "reason": "Capture current screen"}
|
||||
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
|
||||
{"action": "clipboard_get", "reason": "Read clipboard contents"}
|
||||
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
|
||||
|
||||
System:
|
||||
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
|
||||
{"action": "wait", "reason": "Wait for screen to load"}
|
||||
{"action": "done", "reason": "Task is complete"}
|
||||
|
||||
═══════════════════════════════════════════
|
||||
ELEMENT PROPERTIES YOU WILL SEE
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Each element in SCREEN_CONTEXT has:
|
||||
- text: visible label or content description
|
||||
- center: [x, y] coordinates to tap
|
||||
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
|
||||
- enabled: false (only shown when disabled — DO NOT tap disabled elements!)
|
||||
- checked: true (only shown for ON checkboxes/toggles)
|
||||
- focused: true (only shown when field has input focus)
|
||||
- hint: placeholder text (only shown when present)
|
||||
- editable: true (only shown for text input fields)
|
||||
- scrollable: true (only shown for scrollable containers)
|
||||
|
||||
═══════════════════════════════════════════
|
||||
CRITICAL RULES
|
||||
═══════════════════════════════════════════
|
||||
|
||||
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
|
||||
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
|
||||
3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered.
|
||||
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
|
||||
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
|
||||
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
|
||||
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
|
||||
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
|
||||
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
|
||||
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
|
||||
11. PASSWORDS: Never log or output the text of password fields.
|
||||
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
|
||||
13. SEARCH: After typing in a search field, use "enter" to submit the search.
|
||||
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
|
||||
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
|
||||
|
||||
// ===========================================
|
||||
// Chat Message Types (Phase 4A)
|
||||
// ===========================================
|
||||
|
||||
/**
 * One piece of a multimodal message body, discriminated by `type`:
 * an inline base64 image (PNG or JPEG) or a plain text fragment.
 */
export type ContentPart =
  | { type: "image"; base64: string; mimeType: "image/jpeg" | "image/png" }
  | { type: "text"; text: string };
|
||||
|
||||
/**
 * A single conversation turn in provider-neutral form.
 * Each provider converts this shape into its own wire format.
 */
export interface ChatMessage {
  /** Either a plain string or an ordered list of text/image parts. */
  content: ContentPart[] | string;
  /** Who authored the turn. */
  role: "assistant" | "system" | "user";
}
|
||||
|
||||
// ===========================================
|
||||
// Provider Interface
|
||||
// ===========================================
|
||||
|
||||
/**
 * Contract implemented by every LLM backend in this module.
 */
export interface LLMProvider {
  /**
   * Static feature flags of the backend. The agent loop checks
   * `supportsImages` before attaching a screenshot to a user message.
   * NOTE(review): how `supportsStreaming` is consumed is not visible in this
   * module — confirm against the streaming caller.
   */
  readonly capabilities: {
    supportsImages: boolean;
    supportsStreaming: boolean;
  };

  /** Sends the conversation and resolves with the parsed action decision. */
  getDecision(messages: ChatMessage[]): Promise<ActionDecision>;

  /**
   * Optional streaming variant: yields the raw response text chunk by chunk.
   * The caller is responsible for concatenating and parsing the chunks.
   */
  getDecisionStream?(messages: ChatMessage[]): AsyncIterable<string>;
}
|
||||
|
||||
// ===========================================
|
||||
// Message Trimming (Phase 4A)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Bounds the conversation to the most recent history.
 *
 * A leading system message is always kept. When the remaining messages exceed
 * `maxHistorySteps * 2` (one user + one assistant message per step), the
 * oldest ones are dropped and replaced by a single user-role note stating how
 * many full steps were omitted.
 *
 * The newest message is never dropped: a zero or negative limit keeps exactly
 * one message instead of sending the model an empty conversation. A `NaN`
 * limit is treated as "no limit".
 *
 * @param messages - Full conversation, oldest first. Not mutated.
 * @param maxHistorySteps - Number of user/assistant steps to retain.
 * @returns `messages` itself when nothing has to be dropped, otherwise a new array.
 */
export function trimMessages(
  messages: ChatMessage[],
  maxHistorySteps: number
): ChatMessage[] {
  if (messages.length === 0) return messages;
  // NaN cannot express a limit; without this guard the slice below would keep
  // everything but still prepend a bogus "[NaN earlier steps omitted]" note.
  if (Number.isNaN(maxHistorySteps)) return messages;

  // System message is always first
  const system = messages[0].role === "system" ? messages[0] : null;
  const rest = system ? messages.slice(1) : messages;

  // Each step = 1 user + 1 assistant message. Clamp to 1 so the latest
  // message (the current observation) always survives.
  const maxMessages = Math.max(1, maxHistorySteps * 2);
  if (rest.length <= maxMessages) {
    return messages;
  }

  const dropped = rest.length - maxMessages;
  const stepsDropped = Math.floor(dropped / 2);
  const trimmed = rest.slice(dropped);

  // Insert a summary note so the model knows history was cut
  const summary: ChatMessage = {
    role: "user",
    content: `[${stepsDropped} earlier steps omitted]`,
  };

  return system ? [system, summary, ...trimmed] : [summary, ...trimmed];
}
|
||||
|
||||
// ===========================================
|
||||
// OpenAI / Groq Provider
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider built on the OpenAI SDK. It serves two backends: OpenAI itself and
 * Groq (selected when `Config.LLM_PROVIDER` is "groq", using Groq's base URL).
 * Images are only forwarded to OpenAI; for Groq each image part is replaced
 * by a short text placeholder.
 */
class OpenAIProvider implements LLMProvider {
  private client: OpenAI;
  private model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };

  constructor() {
    const isGroq = Config.LLM_PROVIDER === "groq";
    this.client = isGroq
      ? new OpenAI({
          apiKey: Config.GROQ_API_KEY,
          baseURL: GROQ_API_BASE_URL,
        })
      : new OpenAI({ apiKey: Config.OPENAI_API_KEY });
    this.model = isGroq ? Config.GROQ_MODEL : Config.OPENAI_MODEL;
    this.capabilities = { supportsImages: !isGroq, supportsStreaming: true };
  }

  /** Converts one provider-neutral content part into the OpenAI part format. */
  private toOpenAIPart(part: ContentPart): OpenAI.ChatCompletionContentPart {
    if (part.type === "text") {
      return { type: "text", text: part.text };
    }
    if (!this.capabilities.supportsImages) {
      // Groq: convert image to text placeholder
      return { type: "text", text: "[Screenshot attached]" };
    }
    // Image is sent as a data URL at low detail.
    return {
      type: "image_url",
      image_url: {
        url: `data:${part.mimeType};base64,${part.base64}`,
        detail: "low",
      },
    };
  }

  /** Maps the whole conversation to OpenAI chat-completion messages. */
  private toOpenAIMessages(
    messages: ChatMessage[]
  ): OpenAI.ChatCompletionMessageParam[] {
    const out: OpenAI.ChatCompletionMessageParam[] = [];
    for (const { role, content } of messages) {
      const converted =
        typeof content === "string"
          ? content
          : content.map((part) => this.toOpenAIPart(part));
      out.push({ role, content: converted } as OpenAI.ChatCompletionMessageParam);
    }
    return out;
  }

  /** Requests a JSON-object completion and parses it into a decision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const completion = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: this.toOpenAIMessages(messages),
    });
    const raw = completion.choices[0].message.content ?? "{}";
    return parseJsonResponse(raw);
  }

  /** Streams the completion, yielding each non-empty content delta. */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const stream = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: this.toOpenAIMessages(messages),
      stream: true,
    });
    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta?.content;
      if (!delta) continue;
      yield delta;
    }
  }
}
|
||||
|
||||
// ===========================================
|
||||
// OpenRouter Provider (Vercel AI SDK)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider that talks to OpenRouter through the Vercel AI SDK
 * (`generateText` / `streamText`).
 */
class OpenRouterProvider implements LLMProvider {
  private openrouter: ReturnType<typeof createOpenRouter>;
  private model: string;
  readonly capabilities = { supportsImages: true, supportsStreaming: true };

  constructor() {
    this.openrouter = createOpenRouter({ apiKey: Config.OPENROUTER_API_KEY });
    this.model = Config.OPENROUTER_MODEL;
  }

  /**
   * Splits the conversation into the SDK's separate `system` string and the
   * remaining user/assistant messages; images become data URLs.
   * Only the first system message is used, and only if its content is a string.
   */
  private toVercelMessages(messages: ChatMessage[]) {
    const firstSystem = messages.find(({ role }) => role === "system");
    const system =
      typeof firstSystem?.content === "string" ? firstSystem.content : "";

    const converted = messages
      .filter(({ role }) => role !== "system")
      .map(({ role, content }) => {
        const speaker = role as "user" | "assistant";
        if (typeof content === "string") {
          return { role: speaker, content };
        }
        const parts = content.map((part) =>
          part.type === "text"
            ? { type: "text" as const, text: part.text }
            : {
                type: "image" as const,
                image: `data:${part.mimeType};base64,${part.base64}`,
              }
        );
        return { role: speaker, content: parts };
      });

    return { system, messages: converted };
  }

  /** Runs a non-streaming generation and parses the text into a decision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const payload = this.toVercelMessages(messages);
    const { text } = await generateText({
      model: this.openrouter.chat(this.model),
      system: payload.system,
      messages: payload.messages as any,
    });
    return parseJsonResponse(text);
  }

  /** Streams the generation, yielding each text chunk as it arrives. */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const payload = this.toVercelMessages(messages);
    const { textStream } = streamText({
      model: this.openrouter.chat(this.model),
      system: payload.system,
      messages: payload.messages as any,
    });
    for await (const piece of textStream) {
      yield piece;
    }
  }
}
|
||||
|
||||
// ===========================================
|
||||
// AWS Bedrock Provider
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider for AWS Bedrock. The request/response format depends on the model
 * family, detected by matching the model id against `BEDROCK_ANTHROPIC_MODELS`
 * and `BEDROCK_META_MODELS`:
 *  - Anthropic: multi-turn messages with optional images, real streaming.
 *  - Meta: single flattened prompt built from the system text and the last user turn.
 *  - anything else: `inputText` / `textGenerationConfig` request, same flattening.
 */
class BedrockProvider implements LLMProvider {
  private client: BedrockRuntimeClient;
  private model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };

  constructor() {
    this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
    this.model = Config.BEDROCK_MODEL;
    // Only Anthropic models on Bedrock support images
    const supportsImages = this.isAnthropicModel();
    this.capabilities = { supportsImages, supportsStreaming: true };
  }

  /** True when the model id contains one of the known Anthropic ids (case-sensitive). */
  private isAnthropicModel(): boolean {
    for (const id of BEDROCK_ANTHROPIC_MODELS) {
      if (this.model.includes(id)) return true;
    }
    return false;
  }

  /** True when the lower-cased model id contains one of the known Meta ids. */
  private isMetaModel(): boolean {
    for (const id of BEDROCK_META_MODELS) {
      if (this.model.toLowerCase().includes(id)) return true;
    }
    return false;
  }

  /**
   * Converts the conversation to the Anthropic messages format: a separate
   * `system` string plus user/assistant turns, with images as base64 sources.
   */
  private buildAnthropicMessages(messages: ChatMessage[]) {
    const firstSystem = messages.find(({ role }) => role === "system");
    const system =
      typeof firstSystem?.content === "string" ? firstSystem.content : "";

    const converted = messages
      .filter(({ role }) => role !== "system")
      .map(({ role, content }) => {
        if (typeof content === "string") {
          return { role, content };
        }
        const parts = content.map((part) =>
          part.type === "text"
            ? { type: "text", text: part.text }
            : {
                type: "image",
                source: {
                  type: "base64",
                  media_type: part.mimeType,
                  data: part.base64,
                },
              }
        );
        return { role, content: parts };
      });

    return { system, messages: converted };
  }

  /** Serializes the Anthropic request body shared by the plain and streaming calls. */
  private buildAnthropicBody(messages: ChatMessage[]): string {
    const { system, messages: converted } = this.buildAnthropicMessages(messages);
    return JSON.stringify({
      anthropic_version: "bedrock-2023-05-31",
      max_tokens: 1024,
      system,
      messages: converted,
    });
  }

  /** Builds the JSON request body matching the configured model family. */
  private buildRequest(messages: ChatMessage[]): string {
    if (this.isAnthropicModel()) {
      return this.buildAnthropicBody(messages);
    }

    // For Meta/other models, flatten to single prompt (no multi-turn / image support):
    // only the system text and the most recent user turn are sent.
    const systemMsg = messages.find(({ role }) => role === "system");
    const sysText =
      typeof systemMsg?.content === "string" ? systemMsg.content : "";

    const lastUser = [...messages].reverse().find(({ role }) => role === "user");
    let lastUserContent = "";
    if (lastUser !== undefined) {
      lastUserContent =
        typeof lastUser.content === "string"
          ? lastUser.content
          : lastUser.content
              .flatMap((part) => (part.type === "text" ? [part.text] : []))
              .join("\n");
    }

    if (this.isMetaModel()) {
      return JSON.stringify({
        prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${sysText}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
        max_gen_len: 512,
        temperature: 0.1,
      });
    }

    return JSON.stringify({
      inputText: `${sysText}\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object.`,
      textGenerationConfig: {
        maxTokenCount: 512,
        temperature: 0.1,
      },
    });
  }

  /** Pulls the generated text out of the model-family-specific response body. */
  private extractResponse(responseBody: Record<string, any>): string {
    if (this.isAnthropicModel()) {
      return responseBody.content[0].text;
    }
    return this.isMetaModel()
      ? (responseBody.generation ?? "")
      : responseBody.results[0].outputText;
  }

  /** Invokes the model once and parses the returned text into a decision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const payload = new TextEncoder().encode(this.buildRequest(messages));
    const response = await this.client.send(
      new InvokeModelCommand({
        modelId: this.model,
        body: payload,
        contentType: "application/json",
        accept: "application/json",
      })
    );

    const responseBody = JSON.parse(new TextDecoder().decode(response.body));
    return parseJsonResponse(this.extractResponse(responseBody));
  }

  /**
   * Streams text deltas for Anthropic models. Other model families fall back
   * to one non-streaming call whose decision is yielded as a single JSON chunk.
   */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    if (!this.isAnthropicModel()) {
      // Fallback: non-streaming for non-Anthropic models
      yield JSON.stringify(await this.getDecision(messages));
      return;
    }

    const response = await this.client.send(
      new InvokeModelWithResponseStreamCommand({
        modelId: this.model,
        body: new TextEncoder().encode(this.buildAnthropicBody(messages)),
        contentType: "application/json",
      })
    );
    if (!response.body) return;

    const decoder = new TextDecoder();
    for await (const event of response.body) {
      const bytes = event.chunk?.bytes;
      if (!bytes) continue;
      // Each chunk is parsed as one JSON event; only text deltas are forwarded.
      const data = JSON.parse(decoder.decode(bytes));
      if (data.type === "content_block_delta" && data.delta?.text) {
        yield data.delta.text;
      }
    }
  }
}
|
||||
|
||||
// ===========================================
|
||||
// Shared JSON Parsing
|
||||
// ===========================================
|
||||
|
||||
/**
 * Parses an LLM reply into an action decision, tolerating replies that wrap
 * the JSON in markdown fences or surrounding prose.
 *
 * Candidates are tried in order:
 *  1. the whole text;
 *  2. the span from the first `{` to the last `}` — needed for decisions that
 *     contain nested objects such as `extras`, which a shortest-match regex
 *     cuts off at the first `}`;
 *  3. the shortest `{...}` match, which still rescues a flat object when the
 *     text contains several separate objects.
 *
 * Only plain JSON objects are accepted; `null`, arrays and primitives are
 * treated as parse failures because callers read decision fields directly.
 *
 * @param text - Raw model output.
 * @returns The parsed decision, or a "wait" decision when nothing parses.
 */
function parseJsonResponse(text: string): ActionDecision {
  const candidates: string[] = [text];

  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  if (open !== -1 && close > open) {
    candidates.push(text.slice(open, close + 1));
  }

  const shortest = text.match(/\{[\s\S]*?\}/);
  if (shortest) {
    candidates.push(shortest[0]);
  }

  for (const candidate of candidates) {
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        return parsed as ActionDecision;
      }
    } catch {
      // Not valid JSON — try the next candidate.
    }
  }

  console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
  return { action: "wait", reason: "Failed to parse response, waiting" };
}
|
||||
|
||||
// ===========================================
|
||||
// Factory
|
||||
// ===========================================
|
||||
|
||||
/**
 * Factory: instantiates the provider selected by `Config.LLM_PROVIDER`.
 * "bedrock" and "openrouter" get their dedicated providers; every other value
 * falls through to `OpenAIProvider` (which itself distinguishes OpenAI and Groq).
 */
export function getLlmProvider(): LLMProvider {
  switch (Config.LLM_PROVIDER) {
    case "bedrock":
      return new BedrockProvider();
    case "openrouter":
      return new OpenRouterProvider();
    default:
      return new OpenAIProvider();
  }
}
|
||||
129
src/logger.ts
Normal file
129
src/logger.ts
Normal file
@@ -0,0 +1,129 @@
|
||||
/**
|
||||
* Session logging for Android Action Kernel.
|
||||
* Writes incremental .partial.json after each step (crash-safe),
|
||||
* and a final .json summary at session end.
|
||||
*/
|
||||
|
||||
import { mkdirSync, writeFileSync } from "fs";
|
||||
import { join } from "path";
|
||||
import type { ActionDecision } from "./actions.js";
|
||||
|
||||
/**
 * Log record for one agent step: what was observed, what the LLM decided,
 * how the action went, and how long both phases took.
 */
export interface StepLog {
  /** Step number as passed by the caller. */
  step: number;
  /** ISO-8601 time at which the entry was created. */
  timestamp: string;
  foregroundApp: string | null;
  elementCount: number;
  screenChanged: boolean;
  /** The subset of the LLM decision that is persisted. */
  llmDecision: Pick<
    ActionDecision,
    "action" | "reason" | "coordinates" | "text" | "think" | "plan" | "planProgress"
  >;
  /** Outcome of executing the decision. */
  actionResult: {
    success: boolean;
    message: string;
  };
  llmLatencyMs: number;
  actionLatencyMs: number;
}
|
||||
|
||||
/**
 * Shape of the JSON document written for a session — both the per-step
 * `.partial.json` snapshot and the final `.json` file.
 */
export interface SessionSummary {
  // Session identity
  sessionId: string;
  goal: string;
  provider: string;
  model: string;

  // Timing (ISO-8601 strings); endTime is the moment the summary was built
  startTime: string;
  endTime: string;

  // Aggregates over `steps`
  totalSteps: number;
  successCount: number;
  failCount: number;

  /** Value passed to `finalize`; always false in partial snapshots. */
  completed: boolean;
  steps: StepLog[];
}
|
||||
|
||||
/**
 * Collects step logs for one agent session and persists them as JSON.
 *
 * After every step the full summary so far is rewritten to
 * `<sessionId>.partial.json`; `finalize` writes `<sessionId>.json`.
 * All file operations are synchronous.
 */
export class SessionLogger {
  private sessionId: string;
  private logDir: string;
  private steps: StepLog[] = [];
  private goal: string;
  private provider: string;
  private model: string;
  private startTime: string;

  /**
   * Creates the log directory (recursively) and assigns a session id made of
   * the current epoch milliseconds plus a short random base-36 suffix.
   */
  constructor(logDir: string, goal: string, provider: string, model: string) {
    const epochMs = Date.now();
    const suffix = Math.random().toString(36).slice(2, 8);
    this.sessionId = `${epochMs}-${suffix}`;
    this.logDir = logDir;
    this.goal = goal;
    this.provider = provider;
    this.model = model;
    this.startTime = new Date().toISOString();

    mkdirSync(this.logDir, { recursive: true });
  }

  /**
   * Records one step and rewrites the partial session file.
   * Only the logged subset of `decision` and `result` is stored.
   */
  logStep(
    step: number,
    foregroundApp: string | null,
    elementCount: number,
    screenChanged: boolean,
    decision: ActionDecision,
    result: { success: boolean; message: string },
    llmLatencyMs: number,
    actionLatencyMs: number
  ): void {
    const { action, reason, coordinates, text, think, plan, planProgress } = decision;
    const { success, message } = result;

    this.steps.push({
      step,
      timestamp: new Date().toISOString(),
      foregroundApp,
      elementCount,
      screenChanged,
      llmDecision: { action, reason, coordinates, text, think, plan, planProgress },
      actionResult: { success, message },
      llmLatencyMs,
      actionLatencyMs,
    });

    // Write partial file after each step (crash-safe)
    const snapshot = this.buildSummary(false);
    writeFileSync(
      join(this.logDir, `${this.sessionId}.partial.json`),
      JSON.stringify(snapshot, null, 2)
    );
  }

  /** Writes the final session file and prints its path. */
  finalize(completed: boolean): void {
    const finalPath = join(this.logDir, `${this.sessionId}.json`);
    const body = JSON.stringify(this.buildSummary(completed), null, 2);
    writeFileSync(finalPath, body);
    console.log(`Session log saved: ${finalPath}`);
  }

  /** Assembles the summary document from the steps logged so far. */
  private buildSummary(completed: boolean): SessionSummary {
    const endTime = new Date().toISOString();

    let successCount = 0;
    let failCount = 0;
    for (const entry of this.steps) {
      if (entry.actionResult.success) {
        successCount++;
      } else {
        failCount++;
      }
    }

    return {
      sessionId: this.sessionId,
      goal: this.goal,
      provider: this.provider,
      model: this.model,
      startTime: this.startTime,
      endTime,
      totalSteps: this.steps.length,
      successCount,
      failCount,
      completed,
      steps: this.steps,
    };
  }
}
|
||||
249
src/sanitizer.ts
Normal file
249
src/sanitizer.ts
Normal file
@@ -0,0 +1,249 @@
|
||||
/**
|
||||
* XML Sanitizer for Android Action Kernel.
|
||||
* Parses Android Accessibility XML and extracts interactive UI elements
|
||||
* with full state information and parent-child hierarchy context.
|
||||
*/
|
||||
|
||||
import { XMLParser } from "fast-xml-parser";
|
||||
|
||||
/**
 * One UI element extracted from the accessibility hierarchy, with its
 * geometry, state flags, and position in the tree.
 */
export interface UIElement {
  // --- Identity -----------------------------------------------------------
  /** The node's `resource-id` attribute ("" when absent). */
  id: string;
  /** The node's `text`, falling back to its `content-desc`. */
  text: string;
  /** Last dot-separated segment of the node's `class` attribute. */
  type: string;

  // --- Geometry -----------------------------------------------------------
  /** Raw `bounds` attribute, e.g. "[x1,y1][x2,y2]". */
  bounds: string;
  /** Midpoint of the bounds rectangle as [x, y], rounded down. */
  center: [number, number];
  /** Extent of the bounds rectangle as [width, height]. */
  size: [number, number];

  // --- State flags --------------------------------------------------------
  clickable: boolean;
  /** True for EditText / AutoCompleteTextView classes or `editable="true"`. */
  editable: boolean;
  /** True unless the node explicitly reports `enabled="false"`. */
  enabled: boolean;
  checked: boolean;
  focused: boolean;
  selected: boolean;
  scrollable: boolean;
  longClickable: boolean;
  password: boolean;
  /** The node's `hint` attribute ("" when absent). */
  hint: string;

  // --- Derived context ----------------------------------------------------
  /** Interaction suggested for this element, derived from its flags. */
  action: "tap" | "type" | "longpress" | "scroll" | "read";
  /** Label of the nearest enclosing node that has bounds ("root" at the top). */
  parent: string;
  /** Number of enclosing nodes that have bounds. */
  depth: number;
}
|
||||
|
||||
/**
 * Builds a comparable fingerprint of the current screen.
 *
 * Each element contributes `id|text|x,y|enabled|checked`; the per-element
 * signatures are joined with ";". Two screens yield the same string exactly
 * when those fields match element by element, in order.
 *
 * @param elements Elements describing the screen.
 * @returns The joined fingerprint ("" for an empty list).
 */
export function computeScreenHash(elements: UIElement[]): string {
  const signatures: string[] = [];
  for (const el of elements) {
    const position = `${el.center[0]},${el.center[1]}`;
    const state = `${el.enabled}|${el.checked}`;
    signatures.push(`${el.id}|${el.text}|${position}|${state}`);
  }
  return signatures.join(";");
}
|
||||
|
||||
/**
 * Parses Android Accessibility XML and returns a rich list of UI elements.
 *
 * A node is reported when it is interactive (clickable, long-clickable,
 * scrollable or editable) or carries text / a content description, and its
 * bounds describe a non-empty rectangle. State flags (enabled, checked,
 * focused, ...) and hierarchy context (parent label, depth) are preserved.
 *
 * Nodes whose `bounds` attribute is not of the form `[x1,y1][x2,y2]` are
 * skipped (their children are still visited), so an element is never emitted
 * with NaN coordinates.
 *
 * @param xmlContent Raw XML of the accessibility hierarchy.
 * @returns Matching elements in pre-order traversal order; an empty array
 *          when the XML cannot be parsed.
 */
export function getInteractiveElements(xmlContent: string): UIElement[] {
  type XmlNode = Record<string, unknown>;
  type Rect = { x1: number; y1: number; x2: number; y2: number };

  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    allowBooleanAttributes: true,
  });

  let parsed: unknown;
  try {
    parsed = parser.parse(xmlContent);
  } catch {
    console.log("Warning: Error parsing XML. The screen might be loading.");
    return [];
  }

  const elements: UIElement[] = [];

  // Matches "[x1,y1][x2,y2]" with integer (possibly negative) coordinates.
  const boundsPattern = /^\[(-?\d+),(-?\d+)\]\[(-?\d+),(-?\d+)\]$/;

  /** Reads an attribute as a string; anything that is not a string counts as absent. */
  const attr = (node: XmlNode, name: string): string => {
    const value = node[`@_${name}`];
    return typeof value === "string" ? value : "";
  };

  /** True only when the attribute is literally "true". */
  const flag = (node: XmlNode, name: string): boolean => attr(node, name) === "true";

  /** Parses a bounds attribute; returns null for anything malformed. */
  const parseBounds = (raw: unknown): Rect | null => {
    if (typeof raw !== "string") return null;
    const match = boundsPattern.exec(raw.trim());
    if (!match) return null;
    return {
      x1: Number(match[1]),
      y1: Number(match[2]),
      x2: Number(match[3]),
      y2: Number(match[4]),
    };
  };

  function walk(node: unknown, parentLabel: string, depth: number): void {
    if (!node || typeof node !== "object") return;
    const current = node as XmlNode;

    const rawBounds = current["@_bounds"];
    if (!rawBounds) {
      // No bounds on this node — just recurse with the inherited context.
      walkChildren(current, parentLabel, depth);
      return;
    }

    const elementClass = attr(current, "class");
    const text = attr(current, "text");
    const desc = attr(current, "content-desc");
    const resourceId = attr(current, "resource-id");
    const hint = attr(current, "hint");

    const isClickable = flag(current, "clickable");
    const isLongClickable = flag(current, "long-clickable");
    const isScrollable = flag(current, "scrollable");
    const isEnabled = attr(current, "enabled") !== "false"; // default true
    const isEditable =
      elementClass.includes("EditText") ||
      elementClass.includes("AutoCompleteTextView") ||
      flag(current, "editable");

    // Build a label for this node to use as parent context for children.
    const typeName = elementClass.split(".").pop() ?? "";
    const nodeLabel = text || desc || resourceId.split("/").pop() || typeName;

    const isInteractive = isClickable || isEditable || isLongClickable || isScrollable;
    const hasContent = !!(text || desc);

    if (isInteractive || hasContent) {
      const rect = parseBounds(rawBounds);
      // Malformed bounds and zero-size (invisible) elements are not reported,
      // but their children are still walked below.
      if (rect && rect.x2 - rect.x1 > 0 && rect.y2 - rect.y1 > 0) {
        let suggestedAction: UIElement["action"];
        if (isEditable) suggestedAction = "type";
        else if (isLongClickable && !isClickable) suggestedAction = "longpress";
        else if (isScrollable && !isClickable) suggestedAction = "scroll";
        else if (isClickable) suggestedAction = "tap";
        else suggestedAction = "read";

        elements.push({
          id: resourceId,
          text: text || desc,
          type: typeName,
          bounds: String(rawBounds),
          center: [
            Math.floor((rect.x1 + rect.x2) / 2),
            Math.floor((rect.y1 + rect.y2) / 2),
          ],
          size: [rect.x2 - rect.x1, rect.y2 - rect.y1],
          clickable: isClickable,
          editable: isEditable,
          enabled: isEnabled,
          checked: flag(current, "checked"),
          focused: flag(current, "focused"),
          selected: flag(current, "selected"),
          scrollable: isScrollable,
          longClickable: isLongClickable,
          password: flag(current, "password"),
          hint,
          action: suggestedAction,
          parent: parentLabel,
          depth,
        });
      }
    }

    // Recurse with this node's label as the new parent context.
    walkChildren(current, nodeLabel, depth + 1);
  }

  function walkChildren(node: XmlNode, parentLabel: string, depth: number): void {
    const childNodes = node["node"];
    if (childNodes) {
      // The parser yields a single object for one child and an array for several.
      const children: unknown[] = Array.isArray(childNodes) ? childNodes : [childNodes];
      for (const child of children) {
        walk(child, parentLabel, depth);
      }
    }

    const hierarchy = node["hierarchy"];
    if (hierarchy) {
      walk(hierarchy, parentLabel, depth);
    }
  }

  walk(parsed, "root", 0);
  return elements;
}
|
||||
|
||||
// ===========================================
|
||||
// Smart Element Filtering (Phase 2A)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Token-lean view of a UI element for the LLM prompt.
 *
 * The three core fields are always present. Every other field is omitted
 * while it holds its default value and only appears when it deviates.
 */
export interface CompactUIElement {
  /** Visible label of the element. */
  text: string;
  /** Center point as [x, y]. */
  center: [number, number];
  /** Suggested interaction for the element. */
  action: UIElement["action"];

  /** Present (as `false`) only for disabled elements. */
  enabled?: false;
  /** Present only for checked elements. */
  checked?: true;
  /** Present only for focused elements. */
  focused?: true;
  /** Present only when the element has a non-empty hint. */
  hint?: string;
  /** Present only for editable elements. */
  editable?: true;
  /** Present only for scrollable elements. */
  scrollable?: true;
}
|
||||
|
||||
/**
 * Reduces a full UIElement to the compact shape sent to the LLM.
 *
 * `text`, `center` and `action` are always copied. A flag is emitted only
 * when it differs from its default: `enabled` when false; `checked`,
 * `focused`, `editable` and `scrollable` when true; `hint` when non-empty.
 *
 * @param el Element to compact.
 * @returns The compact representation (shares `center` with `el`).
 */
export function compactElement(el: UIElement): CompactUIElement {
  const compact: CompactUIElement = {
    text: el.text,
    center: el.center,
    action: el.action,
    ...(el.enabled ? {} : { enabled: false as const }),
    ...(el.checked ? { checked: true as const } : {}),
    ...(el.focused ? { focused: true as const } : {}),
    ...(el.hint ? { hint: el.hint } : {}),
    ...(el.editable ? { editable: true as const } : {}),
    ...(el.scrollable ? { scrollable: true as const } : {}),
  };
  return compact;
}
|
||||
|
||||
/**
 * Rates how relevant an element is for the LLM as a sum of fixed weights:
 * enabled +10, editable +8, focused +6, clickable or long-clickable +5,
 * non-empty text +3.
 */
function scoreElement(el: UIElement): number {
  const weightedTraits: Array<[unknown, number]> = [
    [el.enabled, 10],
    [el.editable, 8],
    [el.focused, 6],
    [el.clickable || el.longClickable, 5],
    [el.text, 3],
  ];
  return weightedTraits.reduce(
    (total, [present, weight]) => (present ? total + weight : total),
    0
  );
}
|
||||
|
||||
/**
 * Picks the most relevant elements for the LLM.
 *
 * Elements whose centers snap to the same 5px grid cell are treated as
 * duplicates and only the highest-scoring one is kept (the earliest wins a
 * tie). Survivors are ordered by descending score, truncated to `limit`
 * entries, and returned in compact form.
 *
 * @param elements Candidate elements.
 * @param limit Maximum number of elements to return.
 */
export function filterElements(
  elements: UIElement[],
  limit: number
): CompactUIElement[] {
  const snapToGrid = (value: number): number => Math.round(value / 5) * 5;

  // Best element seen so far for each grid cell.
  const bestPerCell = new Map<string, UIElement>();
  for (const candidate of elements) {
    const cell = `${snapToGrid(candidate.center[0])},${snapToGrid(candidate.center[1])}`;
    const incumbent = bestPerCell.get(cell);
    if (incumbent && scoreElement(candidate) <= scoreElement(incumbent)) {
      continue;
    }
    bestPerCell.set(cell, candidate);
  }

  // Highest score first, then keep the top `limit`.
  const ranked = [...bestPerCell.values()];
  ranked.sort((a, b) => scoreElement(b) - scoreElement(a));
  return ranked.slice(0, limit).map(compactElement);
}
|
||||
Reference in New Issue
Block a user