Flatten project structure: move android-action-kernel/ to root

Removes the unnecessary nesting — all source, config, and docs now live
at the project root for simpler paths and commands.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sanju Sivalingam
2026-02-06 16:02:40 +05:30
parent 610fd04818
commit 879509aebc
16 changed files with 862 additions and 7 deletions

390
src/actions.ts Normal file
View File

@@ -0,0 +1,390 @@
/**
* Action execution module for Android Action Kernel.
* Handles all ADB commands for interacting with Android devices.
*
* Supported actions:
* tap, type, enter, swipe, home, back, wait, done,
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
*/
import { Config } from "./config.js";
import {
KEYCODE_ENTER,
KEYCODE_HOME,
KEYCODE_BACK,
KEYCODE_DEL,
KEYCODE_MOVE_HOME,
KEYCODE_MOVE_END,
SWIPE_COORDS,
SWIPE_DURATION_MS,
LONG_PRESS_DURATION_MS,
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
computeSwipeCoords,
} from "./constants.js";
/**
 * The JSON decision emitted by the LLM each step. `action` names one of the
 * 15 supported actions; every other field is an optional payload read only
 * by the matching handler in executeAction().
 */
export interface ActionDecision {
  action: string;
  // tap / longpress target as [x, y] screen pixels
  coordinates?: [number, number];
  // type / clipboard_set payload
  text?: string;
  // swipe direction: "up" | "down" | "left" | "right"
  direction?: string;
  // human-readable justification; echoed to console and session logs
  reason?: string;
  // launch action: explicit component, VIEW-intent URI, and string extras
  package?: string;
  activity?: string;
  uri?: string;
  extras?: Record<string, string>;
  // shell action: raw command executed via `adb shell`
  command?: string;
  // screenshot action: local destination path
  filename?: string;
  // planning fields (Phase 4B) — logged, not executed
  think?: string;
  plan?: string[];
  planProgress?: string;
}
/**
 * Outcome of one executed action. `data` carries optional payload text
 * (e.g. shell output, clipboard contents, or a screenshot path).
 */
export interface ActionResult {
  success: boolean;
  message: string;
  data?: string;
}
/**
 * Runs an ADB command synchronously, retrying with exponential backoff when
 * stderr reports an error. Returns trimmed stdout; when every retry fails
 * the error is logged and the (possibly empty) stdout is still returned.
 */
export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): string {
  let attempt = 0;
  while (attempt <= retries) {
    const proc = Bun.spawnSync([Config.ADB_PATH, ...command], {
      stdout: "pipe",
      stderr: "pipe",
    });
    const out = proc.stdout.toString().trim();
    const err = proc.stderr.toString().trim();
    const looksLikeError = err !== "" && err.toLowerCase().includes("error");
    if (looksLikeError && attempt < retries) {
      // Exponential backoff: 1s, 2s, 4s, ...
      const backoffMs = 2 ** attempt * 1000;
      console.log(`ADB Error (attempt ${attempt + 1}/${retries + 1}): ${err}`);
      console.log(`Retrying in ${backoffMs / 1000}s...`);
      Bun.sleepSync(backoffMs);
      attempt++;
      continue;
    }
    if (looksLikeError) {
      console.log(`ADB Error (all retries exhausted): ${err}`);
    }
    return out;
  }
  return "";
}
// ===========================================
// Device Intelligence (Phase 1)
// ===========================================
/** Module-level dynamic swipe coords, set by initDeviceContext(); null until then. */
let dynamicSwipeCoords: Record<string, [number, number, number, number]> | null = null;
/**
 * Detects the connected device's screen resolution via `adb shell wm size`.
 * Returns [width, height], or null when no size line can be parsed.
 */
export function getScreenResolution(): [number, number] | null {
  try {
    const output = runAdbCommand(["shell", "wm", "size"]);
    // An override (if set) takes precedence over the physical size.
    const patterns = [
      /Override size:\s*(\d+)x(\d+)/,
      /Physical size:\s*(\d+)x(\d+)/,
    ];
    for (const pattern of patterns) {
      const m = output.match(pattern);
      if (m) {
        return [parseInt(m[1], 10), parseInt(m[2], 10)];
      }
    }
  } catch {
    console.log("Warning: Could not detect screen resolution.");
  }
  return null;
}
/**
 * Detects the currently resumed (foreground) activity via dumpsys.
 * Returns "package/activity", or null when the dump cannot be parsed.
 */
export function getForegroundApp(): string | null {
  try {
    const dump = runAdbCommand([
      "shell", "dumpsys", "activity", "activities",
    ]);
    // The mResumedActivity line contains the pkg/activity token.
    const resumed = dump.match(/mResumedActivity.*?(\S+\/\S+)/);
    if (!resumed) {
      return null;
    }
    return resumed[1].replace("}", "");
  } catch {
    // Best-effort: callers treat null as "unknown".
    return null;
  }
}
/**
 * Computes and caches swipe coordinates for the detected resolution.
 * Call once at startup; until then swipes use the hardcoded defaults.
 */
export function initDeviceContext(resolution: [number, number]): void {
  const [width, height] = resolution;
  dynamicSwipeCoords = computeSwipeCoords(width, height);
}
/** Returns the resolution-scaled swipe table when initialized, else the 1080x2400 defaults. */
function getSwipeCoords(): Record<string, [number, number, number, number]> {
  if (dynamicSwipeCoords !== null) {
    return dynamicSwipeCoords;
  }
  return SWIPE_COORDS;
}
/**
 * Dispatches the LLM's decision to the matching handler and returns its
 * result. Unknown action names are logged and reported as failures.
 */
export function executeAction(action: ActionDecision): ActionResult {
  const handlers: Record<string, () => ActionResult> = {
    tap: () => executeTap(action),
    type: () => executeType(action),
    enter: () => executeEnter(),
    swipe: () => executeSwipe(action),
    home: () => executeHome(),
    back: () => executeBack(),
    wait: () => executeWait(),
    done: () => executeDone(action),
    longpress: () => executeLongPress(action),
    screenshot: () => executeScreenshot(action),
    launch: () => executeLaunch(action),
    clear: () => executeClear(),
    clipboard_get: () => executeClipboardGet(),
    clipboard_set: () => executeClipboardSet(action),
    shell: () => executeShell(action),
  };
  // hasOwnProperty guard so inherited names ("toString", …) don't dispatch.
  if (!Object.prototype.hasOwnProperty.call(handlers, action.action)) {
    console.log(`Warning: Unknown action: ${action.action}`);
    return { success: false, message: `Unknown action: ${action.action}` };
  }
  return handlers[action.action]();
}
// ===========================================
// Original actions (enhanced)
// ===========================================
/** Taps the screen at the decision's coordinates (defaults to (0, 0)). */
function executeTap(action: ActionDecision): ActionResult {
  const coords = action.coordinates ?? [0, 0];
  const x = coords[0];
  const y = coords[1];
  console.log(`Tapping: (${x}, ${y})`);
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);
  return { success: true, message: `Tapped (${x}, ${y})` };
}
/** Types text into the focused field via `adb shell input text`. */
function executeType(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to type" };
  // `input text` needs %s for spaces and backslash-escapes for shell
  // metacharacters. Order matters: backslashes are doubled first so that
  // later-introduced escapes are not themselves re-escaped.
  const replacements: Array<[string, string]> = [
    ["\\", "\\\\"],
    ["\"", "\\\""],
    ["'", "\\'"],
    [" ", "%s"],
    ["&", "\\&"],
    ["|", "\\|"],
    [";", "\\;"],
    ["(", "\\("],
    [")", "\\)"],
    ["<", "\\<"],
    [">", "\\>"],
  ];
  let escaped = text;
  for (const [from, to] of replacements) {
    escaped = escaped.replaceAll(from, to);
  }
  console.log(`Typing: ${text}`);
  runAdbCommand(["shell", "input", "text", escaped]);
  return { success: true, message: `Typed "${text}"` };
}
/** Sends the Enter key (keycode 66) to submit the focused input. */
function executeEnter(): ActionResult {
  console.log("Pressing Enter");
  const keyArgs = ["shell", "input", "keyevent", KEYCODE_ENTER];
  runAdbCommand(keyArgs);
  return { success: true, message: "Pressed Enter" };
}
function executeSwipe(action: ActionDecision): ActionResult {
const direction = action.direction ?? "up";
const swipeCoords = getSwipeCoords();
const coords = swipeCoords[direction] ?? swipeCoords["up"];
console.log(`Swiping ${direction}`);
runAdbCommand([
"shell", "input", "swipe",
String(coords[0]), String(coords[1]),
String(coords[2]), String(coords[3]),
SWIPE_DURATION_MS,
]);
return { success: true, message: `Swiped ${direction}` };
}
/** Presses the Home key to return to the launcher. */
function executeHome(): ActionResult {
  console.log("Going Home");
  const keyArgs = ["shell", "input", "keyevent", KEYCODE_HOME];
  runAdbCommand(keyArgs);
  return { success: true, message: "Went to home screen" };
}
/** Presses the Back key to navigate one screen back. */
function executeBack(): ActionResult {
  console.log("Going Back");
  const keyArgs = ["shell", "input", "keyevent", KEYCODE_BACK];
  runAdbCommand(keyArgs);
  return { success: true, message: "Went back" };
}
/** Blocks for two seconds to let the UI settle. */
function executeWait(): ActionResult {
  console.log("Waiting...");
  const pauseMs = 2000;
  Bun.sleepSync(pauseMs);
  return { success: true, message: "Waited 2s" };
}
/** Terminal action: logs the completion reason; the kernel exits on "done". */
function executeDone(action: ActionDecision): ActionResult {
  const reason = action.reason ?? "Task complete";
  console.log(`Goal Achieved: ${reason}`);
  return { success: true, message: "done" };
}
// ===========================================
// New actions
// ===========================================
/**
 * Long-presses at the given coordinates (opens context menus, starts drag
 * mode, etc.) by issuing a zero-distance swipe held for the long-press
 * duration.
 */
function executeLongPress(action: ActionDecision): ActionResult {
  const [x, y] = action.coordinates ?? [0, 0];
  console.log(`Long pressing: (${x}, ${y})`);
  const point = [String(x), String(y)];
  runAdbCommand([
    "shell", "input", "swipe",
    ...point, ...point,
    LONG_PRESS_DURATION_MS,
  ]);
  return { success: true, message: `Long pressed (${x}, ${y})` };
}
/**
 * Captures a screenshot on the device and pulls it to the local filesystem.
 * Saves to `action.filename` when provided, otherwise LOCAL_SCREENSHOT_PATH;
 * the local path is returned in `data`.
 */
function executeScreenshot(action: ActionDecision): ActionResult {
  const filename = action.filename ?? LOCAL_SCREENSHOT_PATH;
  // Bug fix: both messages previously printed the literal "$(unknown)"
  // instead of interpolating the destination filename.
  console.log(`Taking screenshot → ${filename}`);
  runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
  runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, filename]);
  return { success: true, message: `Screenshot saved to ${filename}`, data: filename };
}
/**
 * Launches an app by package name, activity, or URI intent.
 *
 * Examples the LLM can produce:
 *   { action: "launch", package: "com.whatsapp" }
 *   { action: "launch", package: "com.whatsapp", activity: ".HomeActivity" }
 *   { action: "launch", uri: "https://maps.google.com/?q=pizza+near+me" }
 *   { action: "launch", package: "com.whatsapp", uri: "content://media/external/images/1",
 *     extras: { "android.intent.extra.TEXT": "Check this out" } }
 */
function executeLaunch(action: ActionDecision): ActionResult {
  // Fast path: bare package with no activity AND no URI — let monkey resolve
  // the default LAUNCHER activity. (Bug fix: this shortcut previously also
  // fired when a URI accompanied the package, silently dropping the URI and
  // extras before the intent was built.)
  if (action.package && !action.activity && !action.uri) {
    const launchResult = runAdbCommand([
      "shell", "monkey", "-p", action.package, "-c",
      "android.intent.category.LAUNCHER", "1",
    ]);
    console.log(`Launching: ${action.package}`);
    return { success: true, message: `Launched ${action.package}`, data: launchResult };
  }
  const args: string[] = ["shell", "am", "start"];
  if (action.uri) {
    args.push("-a", "android.intent.action.VIEW");
    args.push("-d", action.uri);
  }
  if (action.package && action.activity) {
    args.push("-n", `${action.package}/${action.activity}`);
  }
  // Attach string extras to the intent.
  if (action.extras) {
    for (const [key, value] of Object.entries(action.extras)) {
      args.push("--es", key, value);
    }
  }
  // Scope an implicit (URI) intent to a package when no explicit activity was
  // given: the `am start` intent spec accepts a trailing PACKAGE argument.
  if (action.package && !action.activity) {
    args.push(action.package);
  }
  const label = action.package ?? action.uri ?? "intent";
  console.log(`Launching: ${label}`);
  const result = runAdbCommand(args);
  return { success: true, message: `Launched ${label}`, data: result };
}
/**
 * Clears the currently focused text field by moving the cursor to the end,
 * selecting back to the start, then deleting the selection.
 * NOTE(review): relies on long-pressing MOVE_HOME extending the selection on
 * the target device/keyboard — confirm on-device.
 */
function executeClear(): ActionResult {
  console.log("Clearing text field");
  const keySequences: string[][] = [
    [KEYCODE_MOVE_END],                   // jump to end of field
    ["--longpress", KEYCODE_MOVE_HOME],   // select back to the start
    [KEYCODE_DEL],                        // delete the selection
  ];
  for (const seq of keySequences) {
    runAdbCommand(["shell", "input", "keyevent", ...seq]);
  }
  return { success: true, message: "Cleared text field" };
}
/**
 * Reads the current clipboard contents. Tries `cmd clipboard get-text`
 * first (direct ADB clipboard access is restricted on Android 10+); when
 * that yields nothing, falls back to a raw binder call for older versions.
 */
function executeClipboardGet(): ActionResult {
  console.log("Reading clipboard");
  const text = runAdbCommand(["shell", "cmd", "clipboard", "get-text"]);
  if (text) {
    console.log(`Clipboard: ${text.slice(0, 100)}`);
    return { success: true, message: `Clipboard: ${text}`, data: text };
  }
  // Older Android versions: raw service call against the clipboard binder.
  const raw = runAdbCommand([
    "shell", "service", "call", "clipboard", "2", "i32", "1",
  ]);
  return { success: true, message: `Clipboard (raw): ${raw}`, data: raw };
}
/**
 * Sets the device clipboard to the given text via `cmd clipboard set-text`.
 */
function executeClipboardSet(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to set on clipboard" };
  const preview = text.slice(0, 50);
  console.log(`Setting clipboard: ${preview}...`);
  runAdbCommand(["shell", "cmd", "clipboard", "set-text", text]);
  return { success: true, message: `Clipboard set to "${preview}"` };
}
/**
 * Runs an arbitrary ADB shell command. Use sparingly for edge cases.
 * NOTE(review): naive whitespace split — quoted arguments are not preserved.
 */
function executeShell(action: ActionDecision): ActionResult {
  const cmd = action.command ?? "";
  if (!cmd) return { success: false, message: "No command provided" };
  console.log(`Shell: ${cmd}`);
  const parts = cmd.split(" ");
  const output = runAdbCommand(["shell", ...parts]);
  return { success: true, message: `Shell output: ${output.slice(0, 200)}`, data: output };
}

99
src/config.ts Normal file
View File

@@ -0,0 +1,99 @@
/**
* Configuration management for Android Action Kernel.
* Bun natively loads .env files — no dotenv needed.
*/
import {
DEVICE_DUMP_PATH,
LOCAL_DUMP_PATH,
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
DEFAULT_MAX_STEPS,
DEFAULT_STEP_DELAY,
DEFAULT_GROQ_MODEL,
DEFAULT_OPENAI_MODEL,
DEFAULT_BEDROCK_MODEL,
DEFAULT_MAX_RETRIES,
DEFAULT_STUCK_THRESHOLD,
DEFAULT_MAX_ELEMENTS,
DEFAULT_LOG_DIR,
DEFAULT_VISION_MODE,
DEFAULT_MAX_HISTORY_STEPS,
DEFAULT_STREAMING_ENABLED,
type VisionMode,
} from "./constants.js";
/** Reads an environment variable, returning `fallback` (default "") when unset. */
function env(key: string, fallback = ""): string {
  const value = process.env[key];
  return value !== undefined ? value : fallback;
}
/**
 * Central runtime configuration, resolved once at module load from
 * environment variables (Bun auto-loads .env) with defaults from constants.
 */
export const Config = {
  // ADB Configuration
  ADB_PATH: env("ADB_PATH", "adb"),
  SCREEN_DUMP_PATH: DEVICE_DUMP_PATH,
  LOCAL_DUMP_PATH: LOCAL_DUMP_PATH,
  DEVICE_SCREENSHOT_PATH: DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH: LOCAL_SCREENSHOT_PATH,
  // Agent Configuration (STEP_DELAY is in seconds)
  MAX_STEPS: parseInt(env("MAX_STEPS", String(DEFAULT_MAX_STEPS)), 10),
  STEP_DELAY: parseFloat(env("STEP_DELAY", String(DEFAULT_STEP_DELAY))),
  MAX_RETRIES: parseInt(env("MAX_RETRIES", String(DEFAULT_MAX_RETRIES)), 10),
  STUCK_THRESHOLD: parseInt(env("STUCK_THRESHOLD", String(DEFAULT_STUCK_THRESHOLD)), 10),
  // Vision mode: "off" | "fallback" (only when tree empty) | "always" (every step)
  // NOTE(review): the cast is unvalidated — an unrecognized VISION_MODE string
  // passes through silently; confirm whether validation is wanted.
  VISION_MODE: (env("VISION_MODE", DEFAULT_VISION_MODE) as VisionMode),
  // Smart element filtering: cap on UI elements sent to the LLM
  MAX_ELEMENTS: parseInt(env("MAX_ELEMENTS", String(DEFAULT_MAX_ELEMENTS)), 10),
  // Session logging
  LOG_DIR: env("LOG_DIR", DEFAULT_LOG_DIR),
  // Multi-turn memory: conversation turns retained per request
  MAX_HISTORY_STEPS: parseInt(env("MAX_HISTORY_STEPS", String(DEFAULT_MAX_HISTORY_STEPS)), 10),
  // Streaming responses (env value must be exactly "true" to enable)
  STREAMING_ENABLED: env("STREAMING_ENABLED", String(DEFAULT_STREAMING_ENABLED)) === "true",
  // LLM Provider: "groq", "openai", "bedrock", or "openrouter"
  LLM_PROVIDER: env("LLM_PROVIDER", "groq"),
  // Groq Configuration
  GROQ_API_KEY: env("GROQ_API_KEY"),
  GROQ_MODEL: env("GROQ_MODEL", DEFAULT_GROQ_MODEL),
  // OpenAI Configuration
  OPENAI_API_KEY: env("OPENAI_API_KEY"),
  OPENAI_MODEL: env("OPENAI_MODEL", DEFAULT_OPENAI_MODEL),
  // AWS Bedrock Configuration
  AWS_REGION: env("AWS_REGION", "us-east-1"),
  BEDROCK_MODEL: env("BEDROCK_MODEL", DEFAULT_BEDROCK_MODEL),
  // OpenRouter Configuration (via Vercel AI SDK)
  OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
  OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),
  /** Resolves the model id for the active provider; unknown providers fall back to the OpenAI model. */
  getModel(): string {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq") return Config.GROQ_MODEL;
    if (provider === "bedrock") return Config.BEDROCK_MODEL;
    if (provider === "openrouter") return Config.OPENROUTER_MODEL;
    return Config.OPENAI_MODEL;
  },
  /** Throws when the active provider's API key is missing. Call once at startup. */
  validate(): void {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq" && !Config.GROQ_API_KEY) {
      throw new Error("GROQ_API_KEY is required when using Groq provider");
    }
    if (provider === "openai" && !Config.OPENAI_API_KEY) {
      throw new Error("OPENAI_API_KEY is required when using OpenAI provider");
    }
    if (provider === "openrouter" && !Config.OPENROUTER_API_KEY) {
      throw new Error("OPENROUTER_API_KEY is required when using OpenRouter provider");
    }
    // Bedrock uses AWS credential chain, no explicit validation needed
  },
};

118
src/constants.ts Normal file
View File

@@ -0,0 +1,118 @@
/**
 * Constants for Android Action Kernel.
 * All magic strings, URLs, and fixed values in one place.
 */
// ===========================================
// API Endpoints
// ===========================================
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";
// ===========================================
// ADB Key Codes
// Values are passed verbatim to `adb shell input keyevent`, which accepts
// both numeric codes and symbolic KEYCODE_* names — hence the mix below.
// ===========================================
export const KEYCODE_ENTER = "66";
export const KEYCODE_HOME = "KEYCODE_HOME";
export const KEYCODE_BACK = "KEYCODE_BACK";
export const KEYCODE_DEL = "67";
export const KEYCODE_FORWARD_DEL = "112";
export const KEYCODE_MOVE_HOME = "122"; // cursor to start of line/field
export const KEYCODE_MOVE_END = "123"; // cursor to end of line/field
export const KEYCODE_MENU = "82";
export const KEYCODE_TAB = "61";
export const KEYCODE_ESCAPE = "111";
export const KEYCODE_DPAD_UP = "19";
export const KEYCODE_DPAD_DOWN = "20";
export const KEYCODE_DPAD_LEFT = "21";
export const KEYCODE_DPAD_RIGHT = "22";
export const KEYCODE_VOLUME_UP = "24";
export const KEYCODE_VOLUME_DOWN = "25";
export const KEYCODE_POWER = "26";
// ===========================================
// Default Screen Coordinates (for swipe actions)
// Fallbacks for a 1080x2400 screen; initDeviceContext() replaces them with
// values scaled to the detected resolution.
// ===========================================
export const SCREEN_CENTER_X = 540;
export const SCREEN_CENTER_Y = 1200;
// Swipe coordinates: [start_x, start_y, end_x, end_y]
// These are the fallback values for 1080x2400 screens
export const SWIPE_COORDS: Record<string, [number, number, number, number]> = {
  up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500],
  down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500],
  left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y],
  right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y],
};
/**
 * Builds a direction → [start_x, start_y, end_x, end_y] swipe table scaled
 * to the given screen size, using the same ratios as the hardcoded
 * 1080x2400 reference values.
 */
export function computeSwipeCoords(
  width: number,
  height: number
): Record<string, [number, number, number, number]> {
  const centerX = Math.floor(width / 2);
  const centerY = Math.floor(height / 2);
  // Vertical endpoints ≈ 20.8% / 62.5% of height (mirrors 500/1500 on 2400h).
  const topY = Math.floor(height * 0.208);
  const bottomY = Math.floor(height * 0.625);
  // Horizontal endpoints ≈ 18.5% / 74.1% of width (mirrors 200/800 on 1080w).
  const leftX = Math.floor(width * 0.185);
  const rightX = Math.floor(width * 0.741);
  return {
    up: [centerX, bottomY, centerX, topY],
    down: [centerX, topY, centerX, bottomY],
    left: [rightX, centerY, leftX, centerY],
    right: [leftX, centerY, rightX, centerY],
  };
}
// Durations are kept as strings because they are passed directly as ADB argv.
export const SWIPE_DURATION_MS = "300";
export const LONG_PRESS_DURATION_MS = "1000";
// ===========================================
// Default Models
// ===========================================
export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile";
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";
// ===========================================
// Bedrock Model Identifiers
// Substrings used to classify a Bedrock model id by vendor.
// ===========================================
export const BEDROCK_ANTHROPIC_MODELS = ["anthropic"];
export const BEDROCK_META_MODELS = ["meta", "llama"];
// ===========================================
// File Paths (DEVICE_* live on the Android device, LOCAL_* on the host)
// ===========================================
export const DEVICE_DUMP_PATH = "/sdcard/window_dump.xml";
export const LOCAL_DUMP_PATH = "window_dump.xml";
export const DEVICE_SCREENSHOT_PATH = "/sdcard/kernel_screenshot.png";
export const LOCAL_SCREENSHOT_PATH = "kernel_screenshot.png";
// ===========================================
// Agent Defaults
// ===========================================
export const DEFAULT_MAX_STEPS = 30;
export const DEFAULT_STEP_DELAY = 2.0; // seconds between steps
export const DEFAULT_MAX_RETRIES = 3;
export const DEFAULT_STUCK_THRESHOLD = 3;
// NOTE(review): appears superseded by DEFAULT_VISION_MODE below — confirm
// nothing outside this view still reads it before removing.
export const DEFAULT_VISION_ENABLED = true;
// Phase 2: Context Quality
export const DEFAULT_MAX_ELEMENTS = 40;
export const DEFAULT_LOG_DIR = "logs";
// Phase 3: Vision Mode
export type VisionMode = "off" | "fallback" | "always";
export const DEFAULT_VISION_MODE: VisionMode = "fallback";
// Phase 4: Multi-turn Memory
export const DEFAULT_MAX_HISTORY_STEPS = 10;
// Phase 5: Streaming
export const DEFAULT_STREAMING_ENABLED = true;

416
src/kernel.ts Normal file
View File

@@ -0,0 +1,416 @@
/**
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
*
* An AI agent that controls Android devices through the accessibility API.
* Uses LLMs to make decisions based on screen context.
*
* Features:
* - Perception -> Reasoning -> Action loop
* - Screen state diffing (stuck loop detection)
* - Error recovery with retries
* - Vision fallback & always-on multimodal screenshots
* - Dynamic early exit on goal completion
* - Smart element filtering (compact JSON, top-N scoring)
* - Multi-turn conversation memory
* - Multi-step planning (think/plan/planProgress)
* - Streaming LLM responses
* - Session logging with crash-safe partial writes
* - Auto-detect screen resolution & foreground app
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
*
* Usage:
* bun run src/kernel.ts
*/
import { existsSync, readFileSync } from "fs";
import { Config } from "./config.js";
import {
executeAction,
runAdbCommand,
getScreenResolution,
getForegroundApp,
initDeviceContext,
type ActionDecision,
type ActionResult,
} from "./actions.js";
import {
getLlmProvider,
trimMessages,
SYSTEM_PROMPT,
type LLMProvider,
type ChatMessage,
type ContentPart,
} from "./llm-providers.js";
import {
getInteractiveElements,
computeScreenHash,
filterElements,
type UIElement,
} from "./sanitizer.js";
import {
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
} from "./constants.js";
import { SessionLogger } from "./logger.js";
// ===========================================
// Screen Perception
// ===========================================
/** One UI snapshot: parsed elements plus the compact filtered JSON sent to the LLM. */
interface ScreenState {
  elements: UIElement[];
  compactJson: string;
}
/**
* Dumps the current UI XML and returns parsed elements + compact filtered JSON for the LLM.
*/
function getScreenState(): ScreenState {
try {
runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
} catch {
console.log("Warning: ADB screen capture failed.");
return { elements: [], compactJson: "Error: Could not capture screen." };
}
if (!existsSync(Config.LOCAL_DUMP_PATH)) {
return { elements: [], compactJson: "Error: Could not capture screen." };
}
const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
const elements = getInteractiveElements(xmlContent);
const compact = filterElements(elements, Config.MAX_ELEMENTS);
return { elements, compactJson: JSON.stringify(compact) };
}
/**
 * Captures a device screenshot, pulls it to the host, and returns it as a
 * base64-encoded PNG string — or null when capture or pull fails.
 */
function captureScreenshotBase64(): string | null {
  try {
    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);
    if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      // readFileSync already returns a Buffer; the previous extra
      // Buffer.from(...) wrap made a needless copy of the image bytes.
      return readFileSync(LOCAL_SCREENSHOT_PATH).toString("base64");
    }
  } catch {
    console.log("Warning: Screenshot capture failed.");
  }
  return null;
}
// ===========================================
// Screen State Diffing
// ===========================================
/** Difference between two consecutive screen snapshots, summarized for the LLM. */
interface ScreenDiff {
  // true when the element hash changed since the previous step
  changed: boolean;
  addedTexts: string[];
  removedTexts: string[];
  // human-readable one-liner injected into the prompt as SCREEN_CHANGE
  summary: string;
}
/**
 * Compares two element snapshots: reports whether the screen hash changed
 * and which element texts appeared/disappeared, with a short summary.
 */
function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const textsOf = (els: UIElement[]) =>
    new Set(els.map((e) => e.text).filter(Boolean));
  const before = textsOf(prevElements);
  const after = textsOf(currElements);
  const addedTexts = [...after].filter((t) => !before.has(t));
  const removedTexts = [...before].filter((t) => !after.has(t));
  const changed =
    computeScreenHash(prevElements) !== computeScreenHash(currElements);
  let summary: string;
  if (!changed) {
    summary = "Screen has NOT changed since last action.";
  } else {
    const fragments: string[] = [];
    if (addedTexts.length > 0) {
      fragments.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
    }
    if (removedTexts.length > 0) {
      fragments.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
    }
    summary = fragments.join(". ") || "Screen layout changed.";
  }
  return { changed, addedTexts, removedTexts, summary };
}
// ===========================================
// Streaming LLM Consumer
// ===========================================
/**
 * Gets a decision from the provider, consuming the streaming API when it is
 * enabled and supported (printing a dot per chunk), otherwise falling back
 * to the blocking getDecision call.
 */
async function getDecisionStreaming(
  llm: LLMProvider,
  messages: ChatMessage[]
): Promise<ActionDecision> {
  const stream = llm.getDecisionStream;
  const canStream =
    Config.STREAMING_ENABLED && llm.capabilities.supportsStreaming && !!stream;
  if (!canStream || !stream) {
    return llm.getDecision(messages);
  }
  let buffer = "";
  process.stdout.write("Thinking");
  for await (const piece of stream.call(llm, messages)) {
    buffer += piece;
    process.stdout.write(".");
  }
  process.stdout.write("\n");
  return parseJsonResponse(buffer);
}
/**
 * Parses an LLM response into an ActionDecision. Accepts raw JSON or JSON
 * embedded in surrounding text/markdown fences; falls back to a "wait"
 * action when nothing parseable is found.
 * (Duplicated from llm-providers for the streaming path.)
 */
function parseJsonResponse(text: string): ActionDecision {
  try {
    return JSON.parse(text);
  } catch {
    // Greedy match from the first "{" to the last "}" so decisions with
    // nested objects (e.g. "extras") are captured whole; the previous lazy
    // match stopped at the first "}" and truncated them.
    const match = text.match(/\{[\s\S]*\}/);
    if (match) {
      try {
        return JSON.parse(match[0]);
      } catch {
        // fall through to the wait fallback
      }
    }
    console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`);
    return { action: "wait", reason: "Failed to parse response, waiting" };
  }
}
// ===========================================
// Main Agent Loop
// ===========================================
/**
 * Main perception → reasoning → action loop.
 * Each step: capture the UI tree, diff it against the previous step (stuck
 * detection), optionally attach a screenshot, ask the LLM for a decision,
 * execute it, and log the step. Exits early on "done" or after `steps`.
 */
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
  const steps = maxSteps ?? Config.MAX_STEPS;
  // Phase 1A: auto-detect resolution so swipes scale to the device.
  const resolution = getScreenResolution();
  if (resolution) {
    initDeviceContext(resolution);
    console.log(`Screen resolution: ${resolution[0]}x${resolution[1]}`);
  } else {
    console.log("Screen resolution: using default 1080x2400 swipe coords");
  }
  console.log("Android Action Kernel Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision: ${Config.VISION_MODE} | Streaming: ${Config.STREAMING_ENABLED}`);
  console.log(`Max elements: ${Config.MAX_ELEMENTS} | History: ${Config.MAX_HISTORY_STEPS} steps`);
  const llm = getLlmProvider();
  // Phase 2B: Session logging
  const logger = new SessionLogger(
    Config.LOG_DIR,
    goal,
    Config.LLM_PROVIDER,
    Config.getModel()
  );
  // Phase 4A: multi-turn conversation memory, seeded with the system prompt.
  const messages: ChatMessage[] = [
    { role: "system", content: SYSTEM_PROMPT },
  ];
  let prevElements: UIElement[] = [];
  let stuckCount = 0;
  for (let step = 0; step < steps; step++) {
    console.log(`\n--- Step ${step + 1}/${steps} ---`);
    // 1. Perception: Capture screen state
    console.log("Scanning screen...");
    const { elements, compactJson: screenContext } = getScreenState();
    // 1B. Foreground app detection
    const foregroundApp = getForegroundApp();
    if (foregroundApp) {
      console.log(`Foreground: ${foregroundApp}`);
    }
    // 2. Screen diff: detect stuck loops
    let diffContext = "";
    let screenChanged = true;
    if (step > 0) {
      const diff = diffScreenState(prevElements, elements);
      screenChanged = diff.changed;
      diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;
      if (!diff.changed) {
        stuckCount++;
        console.log(
          `Warning: Screen unchanged for ${stuckCount} step(s).`
        );
        if (stuckCount >= Config.STUCK_THRESHOLD) {
          console.log(
            `Stuck for ${stuckCount} steps. Injecting recovery hint.`
          );
          diffContext +=
            `\nWARNING: You have been stuck for ${stuckCount} steps. ` +
            `The screen is NOT changing. Try a DIFFERENT action: ` +
            `swipe to scroll, press back, go home, or launch a different app.` +
            `\nYour plan is not working. Create a NEW plan with a different approach.`;
        }
      } else {
        stuckCount = 0;
      }
    }
    prevElements = elements;
    // 3. Vision: capture screenshot based on VISION_MODE
    let screenshotBase64: string | null = null;
    let visionContext = "";
    const shouldCaptureVision =
      Config.VISION_MODE === "always" ||
      (Config.VISION_MODE === "fallback" && elements.length === 0);
    if (shouldCaptureVision) {
      screenshotBase64 = captureScreenshotBase64();
      if (elements.length === 0) {
        visionContext =
          "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
          "A screenshot has been captured. The screen likely contains custom-drawn " +
          "content (game, WebView, or Flutter). Try using coordinate-based taps on " +
          "common UI positions, or use 'back'/'home' to navigate away.";
      }
      if (screenshotBase64 && llm.capabilities.supportsImages) {
        console.log("Sending screenshot to LLM");
      }
    }
    // 4. Build user message with all context
    const foregroundLine = foregroundApp
      ? `FOREGROUND_APP: ${foregroundApp}\n\n`
      : "";
    const textContent =
      `GOAL: ${goal}\n\n${foregroundLine}SCREEN_CONTEXT:\n${screenContext}${diffContext}${visionContext}`;
    // Build content parts (text + optional image)
    const userContent: ContentPart[] = [{ type: "text", text: textContent }];
    if (screenshotBase64 && llm.capabilities.supportsImages) {
      userContent.push({
        type: "image",
        base64: screenshotBase64,
        mimeType: "image/png",
      });
    }
    messages.push({ role: "user", content: userContent });
    // Trim messages to keep within history limit
    const trimmed = trimMessages(messages, Config.MAX_HISTORY_STEPS);
    // 5. Reasoning: Get LLM decision
    const llmStart = performance.now();
    let decision: ActionDecision;
    try {
      decision = await getDecisionStreaming(llm, trimmed);
    } catch (err) {
      console.log(`LLM Error: ${(err as Error).message}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting" };
    }
    const llmLatency = performance.now() - llmStart;
    // Log thinking and planning
    if (decision.think) {
      console.log(`Think: ${decision.think}`);
    }
    if (decision.plan) {
      console.log(`Plan: ${decision.plan.join(" -> ")}`);
    }
    if (decision.planProgress) {
      console.log(`Progress: ${decision.planProgress}`);
    }
    // Bug fix: action and reason were previously concatenated with no separator.
    console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"} (${Math.round(llmLatency)}ms)`);
    // Append assistant response to conversation
    messages.push({
      role: "assistant",
      content: JSON.stringify(decision),
    });
    // 6. Action: Execute the decision
    const actionStart = performance.now();
    let result: ActionResult;
    try {
      result = executeAction(decision);
    } catch (err) {
      console.log(`Action Error: ${(err as Error).message}`);
      result = { success: false, message: (err as Error).message };
    }
    const actionLatency = performance.now() - actionStart;
    // Log step
    logger.logStep(
      step + 1,
      foregroundApp,
      elements.length,
      screenChanged,
      decision,
      result,
      Math.round(llmLatency),
      Math.round(actionLatency)
    );
    console.log(`Messages in context: ${trimmed.length}`);
    // 7. Check for goal completion
    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      logger.finalize(true);
      return;
    }
    // Wait for UI to update
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }
  console.log("\nMax steps reached. Task may be incomplete.");
  logger.finalize(false);
}
// ===========================================
// Entry Point
// ===========================================
/**
 * Entry point: validates configuration, reads the goal from the first stdin
 * chunk, and runs the agent loop.
 */
async function main(): Promise<void> {
  try {
    Config.validate();
  } catch (e) {
    console.log(`Configuration Error: ${(e as Error).message}`);
    return;
  }
  // Prompt for the goal and read a single stdin chunk.
  process.stdout.write("Enter your goal: ");
  const reader = Bun.stdin.stream().getReader();
  const { value } = await reader.read();
  reader.releaseLock();
  const goal = new TextDecoder().decode(value).trim();
  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }
  await runAgent(goal);
}
main();

535
src/llm-providers.ts Normal file
View File

@@ -0,0 +1,535 @@
/**
* LLM Provider module for Android Action Kernel.
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
*
* Phase 3: Real multimodal vision (image content parts)
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
* Phase 5: Streaming responses (getDecisionStream)
*/
import OpenAI from "openai";
import {
BedrockRuntimeClient,
InvokeModelCommand,
InvokeModelWithResponseStreamCommand,
} from "@aws-sdk/client-bedrock-runtime";
import { generateText, streamText } from "ai";
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
import { Config } from "./config.js";
import {
GROQ_API_BASE_URL,
BEDROCK_ANTHROPIC_MODELS,
BEDROCK_META_MODELS,
} from "./constants.js";
import type { ActionDecision } from "./actions.js";
// ===========================================
// System Prompt — all 15 actions + planning
// ===========================================
/**
 * System prompt shared by all providers. Enumerates the 15 supported
 * actions, the element properties the model will see in SCREEN_CONTEXT,
 * and the behavioral rules (planning fields, stuck detection, etc.).
 *
 * NOTE: the action names and JSON shapes described here must stay in sync
 * with the ActionDecision interface and the executor in actions.ts —
 * TODO confirm against that file when editing.
 */
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
You will receive:
1. GOAL — the user's task.
2. FOREGROUND_APP — the currently active app package and activity.
3. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates and states.
4. SCREENSHOT — an image of the current screen (when available).
5. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
6. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
Previous conversation turns contain your earlier observations and actions (multi-turn memory).
You must output ONLY a valid JSON object with your next action.
═══════════════════════════════════════════
THINKING & PLANNING
═══════════════════════════════════════════
Before each action, include a "think" field with your reasoning about the current state and what to do next.
Optionally include:
- "plan": an array of 3-5 high-level steps to achieve the goal
- "planProgress": a brief note on which plan step you're currently on
Example:
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
═══════════════════════════════════════════
AVAILABLE ACTIONS (15 total)
═══════════════════════════════════════════
Navigation:
{"action": "tap", "coordinates": [x, y], "reason": "..."}
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
{"action": "enter", "reason": "Press Enter/submit"}
{"action": "back", "reason": "Navigate back"}
{"action": "home", "reason": "Go to home screen"}
Text Input:
{"action": "type", "text": "Hello World", "reason": "..."}
{"action": "clear", "reason": "Clear current text field before typing"}
App Control:
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
Data:
{"action": "screenshot", "reason": "Capture current screen"}
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
{"action": "clipboard_get", "reason": "Read clipboard contents"}
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
System:
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
{"action": "wait", "reason": "Wait for screen to load"}
{"action": "done", "reason": "Task is complete"}
═══════════════════════════════════════════
ELEMENT PROPERTIES YOU WILL SEE
═══════════════════════════════════════════
Each element in SCREEN_CONTEXT has:
- text: visible label or content description
- center: [x, y] coordinates to tap
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
- enabled: false (only shown when disabled — DO NOT tap disabled elements!)
- checked: true (only shown for ON checkboxes/toggles)
- focused: true (only shown when field has input focus)
- hint: placeholder text (only shown when present)
- editable: true (only shown for text input fields)
- scrollable: true (only shown for scrollable containers)
═══════════════════════════════════════════
CRITICAL RULES
═══════════════════════════════════════════
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered.
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
11. PASSWORDS: Never log or output the text of password fields.
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
13. SEARCH: After typing in a search field, use "enter" to submit the search.
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
// ===========================================
// Chat Message Types (Phase 4A)
// ===========================================
/** One piece of message content: plain text, or an inline base64 image. */
export type ContentPart =
  | { type: "text"; text: string }
  | { type: "image"; base64: string; mimeType: "image/png" | "image/jpeg" };
/**
 * Provider-neutral chat message. `content` is either a plain string or a
 * list of parts (text + screenshots) for multimodal turns. Each provider
 * converts this shape into its own wire format.
 */
export interface ChatMessage {
  role: "system" | "user" | "assistant";
  content: string | ContentPart[];
}
// ===========================================
// Provider Interface
// ===========================================
/**
 * Common contract implemented by every LLM backend
 * (OpenAI/Groq, OpenRouter, AWS Bedrock).
 */
export interface LLMProvider {
  readonly capabilities: {
    supportsImages: boolean; // whether image content parts are forwarded to the model
    supportsStreaming: boolean; // whether getDecisionStream is usable
  };
  // Single-shot: returns a parsed ActionDecision for the conversation so far.
  getDecision(messages: ChatMessage[]): Promise<ActionDecision>;
  // Optional streaming variant: yields raw response-text chunks.
  getDecisionStream?(messages: ChatMessage[]): AsyncIterable<string>;
}
// ===========================================
// Message Trimming (Phase 4A)
// ===========================================
/**
 * Caps the conversation at `maxHistorySteps` user/assistant exchanges.
 *
 * The leading system message (when present) is always preserved. When the
 * history exceeds the cap, the oldest messages are dropped and a single
 * synthetic user note ("[N earlier steps omitted]") is inserted so the
 * model knows context was elided.
 */
export function trimMessages(
  messages: ChatMessage[],
  maxHistorySteps: number
): ChatMessage[] {
  if (messages.length === 0) return messages;
  const hasSystem = messages[0].role === "system";
  const systemMsg = hasSystem ? messages[0] : null;
  const history = hasSystem ? messages.slice(1) : messages;
  // One step = one user message + one assistant reply.
  const keepCount = maxHistorySteps * 2;
  if (history.length <= keepCount) return messages;
  const dropCount = history.length - keepCount;
  const note: ChatMessage = {
    role: "user",
    content: `[${Math.floor(dropCount / 2)} earlier steps omitted]`,
  };
  const kept = history.slice(dropCount);
  return systemMsg ? [systemMsg, note, ...kept] : [note, ...kept];
}
// ===========================================
// OpenAI / Groq Provider
// ===========================================
/**
 * OpenAI-compatible provider. Serves both OpenAI proper and Groq (which
 * exposes an OpenAI-compatible endpoint). Groq-hosted models are treated
 * as text-only: image parts are replaced with a text placeholder.
 */
class OpenAIProvider implements LLMProvider {
  private client: OpenAI;
  private model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };

  constructor() {
    const isGroq = Config.LLM_PROVIDER === "groq";
    if (isGroq) {
      this.client = new OpenAI({
        apiKey: Config.GROQ_API_KEY,
        baseURL: GROQ_API_BASE_URL,
      });
    } else {
      this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
    }
    this.model = isGroq ? Config.GROQ_MODEL : Config.OPENAI_MODEL;
    this.capabilities = { supportsImages: !isGroq, supportsStreaming: true };
  }

  /** Converts provider-neutral messages into the OpenAI chat format. */
  private toOpenAIMessages(
    messages: ChatMessage[]
  ): OpenAI.ChatCompletionMessageParam[] {
    const converted: OpenAI.ChatCompletionMessageParam[] = [];
    for (const msg of messages) {
      if (typeof msg.content === "string") {
        converted.push({
          role: msg.role,
          content: msg.content,
        } as OpenAI.ChatCompletionMessageParam);
        continue;
      }
      const parts: OpenAI.ChatCompletionContentPart[] = [];
      for (const part of msg.content) {
        if (part.type === "text") {
          parts.push({ type: "text" as const, text: part.text });
        } else if (this.capabilities.supportsImages) {
          // Inline the screenshot as a low-detail data URL.
          parts.push({
            type: "image_url" as const,
            image_url: {
              url: `data:${part.mimeType};base64,${part.base64}`,
              detail: "low" as const,
            },
          });
        } else {
          // Text-only model (Groq): stand in for the image with a note.
          parts.push({ type: "text" as const, text: "[Screenshot attached]" });
        }
      }
      converted.push({
        role: msg.role,
        content: parts,
      } as OpenAI.ChatCompletionMessageParam);
    }
    return converted;
  }

  /** Single-shot completion in JSON mode; parsed into an ActionDecision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const response = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: this.toOpenAIMessages(messages),
    });
    return parseJsonResponse(response.choices[0].message.content ?? "{}");
  }

  /** Streams raw response text chunk by chunk. */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const stream = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: this.toOpenAIMessages(messages),
      stream: true,
    });
    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta?.content;
      if (delta) yield delta;
    }
  }
}
// ===========================================
// OpenRouter Provider (Vercel AI SDK)
// ===========================================
/**
 * OpenRouter provider, driven through the Vercel AI SDK.
 * Fully multimodal and streaming-capable.
 */
class OpenRouterProvider implements LLMProvider {
  private openrouter: ReturnType<typeof createOpenRouter>;
  private model: string;
  readonly capabilities = { supportsImages: true, supportsStreaming: true };

  constructor() {
    this.openrouter = createOpenRouter({ apiKey: Config.OPENROUTER_API_KEY });
    this.model = Config.OPENROUTER_MODEL;
  }

  /**
   * Splits out the system prompt and converts remaining messages — image
   * parts become data URLs — into the Vercel AI SDK message shape.
   */
  private toVercelMessages(messages: ChatMessage[]) {
    const systemMsg = messages.find((m) => m.role === "system");
    const converted = messages
      .filter((m) => m.role !== "system")
      .map((msg) => {
        const role = msg.role as "user" | "assistant";
        if (typeof msg.content === "string") {
          return { role, content: msg.content };
        }
        const parts = msg.content.map((part) =>
          part.type === "text"
            ? { type: "text" as const, text: part.text }
            : {
                type: "image" as const,
                image: `data:${part.mimeType};base64,${part.base64}`,
              }
        );
        return { role, content: parts };
      });
    return {
      system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
      messages: converted,
    };
  }

  /** Single-shot generation; parsed into an ActionDecision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const payload = this.toVercelMessages(messages);
    const result = await generateText({
      model: this.openrouter.chat(this.model),
      system: payload.system,
      messages: payload.messages as any,
    });
    return parseJsonResponse(result.text);
  }

  /** Streams raw response text chunk by chunk. */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const payload = this.toVercelMessages(messages);
    const result = streamText({
      model: this.openrouter.chat(this.model),
      system: payload.system,
      messages: payload.messages as any,
    });
    yield* result.textStream;
  }
}
// ===========================================
// AWS Bedrock Provider
// ===========================================
/**
 * AWS Bedrock provider. Handles three model families with distinct wire
 * formats, detected by substring match on the configured model id:
 *   - Anthropic (Claude): messages API, multimodal, streaming
 *   - Meta (Llama): single flattened prompt with chat-template tokens, text-only
 *   - fallback (e.g. Titan-style): plain inputText prompt, text-only
 */
class BedrockProvider implements LLMProvider {
  private client: BedrockRuntimeClient;
  private model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };
  constructor() {
    this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
    this.model = Config.BEDROCK_MODEL;
    // Only Anthropic models on Bedrock support images
    this.capabilities = {
      supportsImages: this.isAnthropicModel(),
      supportsStreaming: true,
    };
  }
  // Family detection is substring-based against the known-model id lists.
  private isAnthropicModel(): boolean {
    return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id));
  }
  private isMetaModel(): boolean {
    return BEDROCK_META_MODELS.some((id) =>
      this.model.toLowerCase().includes(id)
    );
  }
  /**
   * Converts neutral ChatMessages into the Anthropic messages format:
   * system prompt extracted separately; image parts become base64 sources.
   */
  private buildAnthropicMessages(messages: ChatMessage[]) {
    const systemMsg = messages.find((m) => m.role === "system");
    const nonSystem = messages.filter((m) => m.role !== "system");
    const converted = nonSystem.map((msg) => {
      if (typeof msg.content === "string") {
        return { role: msg.role, content: msg.content };
      }
      const parts = msg.content.map((part) => {
        if (part.type === "text") {
          return { type: "text", text: part.text };
        }
        return {
          type: "image",
          source: {
            type: "base64",
            media_type: part.mimeType,
            data: part.base64,
          },
        };
      });
      return { role: msg.role, content: parts };
    });
    return {
      system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
      messages: converted,
    };
  }
  /**
   * Serializes the request body for the configured model family.
   * Non-Anthropic paths lose multi-turn memory and images: only the
   * system prompt and the LAST user message are sent.
   */
  private buildRequest(messages: ChatMessage[]): string {
    if (this.isAnthropicModel()) {
      const { system, messages: converted } = this.buildAnthropicMessages(messages);
      return JSON.stringify({
        anthropic_version: "bedrock-2023-05-31",
        max_tokens: 1024,
        system,
        messages: converted,
      });
    }
    // For Meta/other models, flatten to single prompt (no multi-turn / image support)
    const systemContent = messages.find((m) => m.role === "system");
    const userMessages = messages
      .filter((m) => m.role === "user")
      .map((m) =>
        typeof m.content === "string"
          ? m.content
          : m.content
              .filter((p) => p.type === "text")
              .map((p) => (p as { type: "text"; text: string }).text)
              .join("\n")
      );
    const lastUserContent = userMessages[userMessages.length - 1] ?? "";
    const sysText =
      typeof systemContent?.content === "string" ? systemContent.content : "";
    if (this.isMetaModel()) {
      // Llama chat-template tokens are embedded directly in the prompt.
      return JSON.stringify({
        prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${sysText}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
        max_gen_len: 512,
        temperature: 0.1,
      });
    }
    // Fallback request shape (inputText/textGenerationConfig).
    return JSON.stringify({
      inputText: `${sysText}\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object.`,
      textGenerationConfig: {
        maxTokenCount: 512,
        temperature: 0.1,
      },
    });
  }
  /** Pulls the generated text out of the family-specific response shape. */
  private extractResponse(responseBody: Record<string, any>): string {
    if (this.isAnthropicModel()) {
      return responseBody.content[0].text;
    }
    if (this.isMetaModel()) {
      return responseBody.generation ?? "";
    }
    return responseBody.results[0].outputText;
  }
  /** Single-shot InvokeModel call; parsed into an ActionDecision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const requestBody = this.buildRequest(messages);
    const command = new InvokeModelCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
      accept: "application/json",
    });
    const response = await this.client.send(command);
    const responseBody = JSON.parse(new TextDecoder().decode(response.body));
    const resultText = this.extractResponse(responseBody);
    return parseJsonResponse(resultText);
  }
  /**
   * Streams response text. Only Anthropic models stream; other families
   * fall back to a single yield of the full (stringified) decision.
   */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    if (!this.isAnthropicModel()) {
      // Fallback: non-streaming for non-Anthropic models
      const decision = await this.getDecision(messages);
      yield JSON.stringify(decision);
      return;
    }
    const { system, messages: converted } = this.buildAnthropicMessages(messages);
    const requestBody = JSON.stringify({
      anthropic_version: "bedrock-2023-05-31",
      max_tokens: 1024,
      system,
      messages: converted,
    });
    const command = new InvokeModelWithResponseStreamCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
    });
    const response = await this.client.send(command);
    if (response.body) {
      // Each stream event carries a JSON chunk; only text deltas are yielded.
      for await (const event of response.body) {
        if (event.chunk?.bytes) {
          const data = JSON.parse(new TextDecoder().decode(event.chunk.bytes));
          if (data.type === "content_block_delta" && data.delta?.text) {
            yield data.delta.text;
          }
        }
      }
    }
  }
}
// ===========================================
// Shared JSON Parsing
// ===========================================
/**
 * Parses an LLM reply into an ActionDecision.
 *
 * Tries strict JSON first; if the model wrapped the object in markdown
 * fences or prose, extracts the first *balanced* {...} span and parses
 * that. The previous regex (`\{[\s\S]*?\}`) was non-greedy, so any nested
 * object — e.g. a "launch" action carrying an "extras" map — was truncated
 * at the first `}` and failed to parse. On total failure, returns a safe
 * "wait" action so the agent loop keeps running.
 */
function parseJsonResponse(text: string): ActionDecision {
  try {
    return JSON.parse(text);
  } catch {
    const candidate = extractFirstJsonObject(text);
    if (candidate !== null) {
      try {
        return JSON.parse(candidate);
      } catch {
        // fall through to the warning below
      }
    }
    console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
    return { action: "wait", reason: "Failed to parse response, waiting" };
  }
}

/**
 * Returns the first balanced JSON object embedded in `text`, or null.
 * Tracks brace depth while skipping braces that appear inside string
 * literals (including escaped quotes), so nested objects come out whole.
 */
function extractFirstJsonObject(text: string): string | null {
  const start = text.indexOf("{");
  if (start === -1) return null;
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (escaped) {
      escaped = false;
      continue;
    }
    if (inString) {
      if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === "{") depth++;
    else if (ch === "}") {
      depth--;
      if (depth === 0) return text.slice(start, i + 1);
    }
  }
  return null; // unbalanced — no complete object found
}
// ===========================================
// Factory
// ===========================================
/**
 * Instantiates the provider selected by Config.LLM_PROVIDER.
 * Any value other than "bedrock"/"openrouter" (i.e. "openai" or "groq")
 * resolves to the OpenAI-compatible provider.
 */
export function getLlmProvider(): LLMProvider {
  switch (Config.LLM_PROVIDER) {
    case "bedrock":
      return new BedrockProvider();
    case "openrouter":
      return new OpenRouterProvider();
    default:
      return new OpenAIProvider();
  }
}

129
src/logger.ts Normal file
View File

@@ -0,0 +1,129 @@
/**
* Session logging for Android Action Kernel.
* Writes incremental .partial.json after each step (crash-safe),
* and a final .json summary at session end.
*/
import { mkdirSync, writeFileSync } from "fs";
import { join } from "path";
import type { ActionDecision } from "./actions.js";
/** One agent step as persisted in the session log. */
export interface StepLog {
  step: number; // 1-based step index within the session
  timestamp: string; // ISO-8601, recorded when the step is logged
  foregroundApp: string | null; // active package/activity, if known
  elementCount: number; // number of UI elements extracted this step
  screenChanged: boolean; // whether the screen hash differed from last step
  // Subset of the LLM's ActionDecision kept for the log.
  llmDecision: {
    action: string;
    reason?: string;
    coordinates?: [number, number];
    text?: string;
    think?: string;
    plan?: string[];
    planProgress?: string;
  };
  // Outcome of executing the decided action.
  actionResult: {
    success: boolean;
    message: string;
  };
  llmLatencyMs: number; // time spent waiting on the LLM
  actionLatencyMs: number; // time spent executing the action
}
/** Whole-session record written as the final .json log file. */
export interface SessionSummary {
  sessionId: string; // unique id, also used as the log filename stem
  goal: string;
  provider: string;
  model: string;
  startTime: string; // ISO-8601
  endTime: string; // ISO-8601, set when the summary is built
  totalSteps: number;
  successCount: number; // steps whose actionResult.success was true
  failCount: number; // steps whose actionResult.success was false
  completed: boolean; // true only when the agent reported "done"
  steps: StepLog[];
}
/**
 * Collects per-step telemetry for one agent session and persists it.
 * After every step a `<id>.partial.json` snapshot is written so a crash
 * loses nothing; `finalize()` writes the definitive `<id>.json` summary.
 */
export class SessionLogger {
  private readonly sessionId: string;
  private readonly logDir: string;
  private readonly goal: string;
  private readonly provider: string;
  private readonly model: string;
  private readonly startTime: string;
  private readonly steps: StepLog[] = [];

  constructor(logDir: string, goal: string, provider: string, model: string) {
    const nonce = Math.random().toString(36).slice(2, 8);
    this.sessionId = `${Date.now()}-${nonce}`;
    this.logDir = logDir;
    this.goal = goal;
    this.provider = provider;
    this.model = model;
    this.startTime = new Date().toISOString();
    mkdirSync(this.logDir, { recursive: true });
  }

  /** Records one step and refreshes the crash-safe partial log on disk. */
  logStep(
    step: number,
    foregroundApp: string | null,
    elementCount: number,
    screenChanged: boolean,
    decision: ActionDecision,
    result: { success: boolean; message: string },
    llmLatencyMs: number,
    actionLatencyMs: number
  ): void {
    this.steps.push({
      step,
      timestamp: new Date().toISOString(),
      foregroundApp,
      elementCount,
      screenChanged,
      llmDecision: {
        action: decision.action,
        reason: decision.reason,
        coordinates: decision.coordinates,
        text: decision.text,
        think: decision.think,
        plan: decision.plan,
        planProgress: decision.planProgress,
      },
      actionResult: {
        success: result.success,
        message: result.message,
      },
      llmLatencyMs,
      actionLatencyMs,
    });
    // Snapshot after every step so a crash mid-session loses nothing.
    writeFileSync(
      join(this.logDir, `${this.sessionId}.partial.json`),
      JSON.stringify(this.buildSummary(false), null, 2)
    );
  }

  /** Writes the final session summary and announces its location. */
  finalize(completed: boolean): void {
    const finalPath = join(this.logDir, `${this.sessionId}.json`);
    writeFileSync(finalPath, JSON.stringify(this.buildSummary(completed), null, 2));
    console.log(`Session log saved: ${finalPath}`);
  }

  /** Assembles the summary object from accumulated state. */
  private buildSummary(completed: boolean): SessionSummary {
    let successCount = 0;
    for (const s of this.steps) {
      if (s.actionResult.success) successCount += 1;
    }
    return {
      sessionId: this.sessionId,
      goal: this.goal,
      provider: this.provider,
      model: this.model,
      startTime: this.startTime,
      endTime: new Date().toISOString(),
      totalSteps: this.steps.length,
      successCount,
      failCount: this.steps.length - successCount,
      completed,
      steps: this.steps,
    };
  }
}

249
src/sanitizer.ts Normal file
View File

@@ -0,0 +1,249 @@
/**
* XML Sanitizer for Android Action Kernel.
* Parses Android Accessibility XML and extracts interactive UI elements
* with full state information and parent-child hierarchy context.
*/
import { XMLParser } from "fast-xml-parser";
/** Full description of one node extracted from the accessibility XML. */
export interface UIElement {
  id: string; // resource-id attribute (may be "")
  text: string; // visible text, falling back to content-desc
  type: string; // class short name, e.g. "Button", "EditText"
  bounds: string; // raw bounds string "[x1,y1][x2,y2]" from the XML
  center: [number, number]; // tap point: midpoint of the bounds
  size: [number, number]; // [width, height] in pixels
  clickable: boolean;
  editable: boolean; // derived from class name or editable attribute
  enabled: boolean; // defaults to true when the attribute is absent
  checked: boolean;
  focused: boolean;
  selected: boolean;
  scrollable: boolean;
  longClickable: boolean;
  password: boolean; // password field — callers must not log its text
  hint: string; // placeholder text ("" when absent)
  action: "tap" | "type" | "longpress" | "scroll" | "read"; // suggested interaction
  parent: string; // label of nearest labeled ancestor (hierarchy context)
  depth: number; // nesting depth in the accessibility tree
}
/**
 * Builds a deterministic fingerprint of the current screen from element
 * identity, label, position, and toggle state. Comparing fingerprints
 * between steps tells the agent whether its last action changed anything.
 */
export function computeScreenHash(elements: UIElement[]): string {
  const fingerprints: string[] = [];
  for (const el of elements) {
    const [cx, cy] = el.center;
    fingerprints.push(`${el.id}|${el.text}|${cx},${cy}|${el.enabled}|${el.checked}`);
  }
  return fingerprints.join(";");
}
/**
 * Parses Android Accessibility XML and returns a rich list of interactive elements.
 * Preserves state (enabled, checked, focused) and hierarchy context.
 *
 * An element is kept when it is interactive (clickable / editable /
 * long-clickable / scrollable) OR carries visible text / content-desc.
 * Zero-size elements are skipped but their children are still walked.
 * Returns [] when the XML fails to parse (e.g. screen still loading).
 */
export function getInteractiveElements(xmlContent: string): UIElement[] {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    allowBooleanAttributes: true,
  });
  let parsed: unknown;
  try {
    parsed = parser.parse(xmlContent);
  } catch {
    console.log("Warning: Error parsing XML. The screen might be loading.");
    return [];
  }
  const elements: UIElement[] = [];
  // Depth-first walk; parentLabel is the label of the nearest labeled
  // ancestor, threaded down so each element keeps hierarchy context.
  function walk(node: any, parentLabel: string, depth: number): void {
    if (!node || typeof node !== "object") return;
    if (node["@_bounds"]) {
      const isClickable = node["@_clickable"] === "true";
      const isLongClickable = node["@_long-clickable"] === "true";
      const isScrollable = node["@_scrollable"] === "true";
      const isEnabled = node["@_enabled"] !== "false"; // default true
      const isChecked = node["@_checked"] === "true";
      const isFocused = node["@_focused"] === "true";
      const isSelected = node["@_selected"] === "true";
      const isPassword = node["@_password"] === "true";
      const elementClass = node["@_class"] ?? "";
      // Editable is inferred from the widget class when the XML has no
      // explicit editable attribute.
      const isEditable =
        elementClass.includes("EditText") ||
        elementClass.includes("AutoCompleteTextView") ||
        node["@_editable"] === "true";
      const text: string = node["@_text"] ?? "";
      const desc: string = node["@_content-desc"] ?? "";
      const resourceId: string = node["@_resource-id"] ?? "";
      const hint: string = node["@_hint"] ?? "";
      // Build a label for this node to use as parent context for children
      const typeName = elementClass.split(".").pop() ?? "";
      const nodeLabel = text || desc || resourceId.split("/").pop() || typeName;
      // Determine if this element should be included
      const isInteractive = isClickable || isEditable || isLongClickable || isScrollable;
      const hasContent = !!(text || desc);
      if (isInteractive || hasContent) {
        const bounds: string = node["@_bounds"];
        try {
          // bounds format: "[x1,y1][x2,y2]" → [x1, y1, x2, y2]
          const coords = bounds
            .replace("][", ",")
            .replace("[", "")
            .replace("]", "")
            .split(",")
            .map(Number);
          const [x1, y1, x2, y2] = coords;
          const centerX = Math.floor((x1 + x2) / 2);
          const centerY = Math.floor((y1 + y2) / 2);
          const width = x2 - x1;
          const height = y2 - y1;
          // Skip zero-size elements (invisible)
          if (width <= 0 || height <= 0) {
            // still walk children
          } else {
            // Pick the single most useful interaction for this element.
            let suggestedAction: UIElement["action"];
            if (isEditable) suggestedAction = "type";
            else if (isLongClickable && !isClickable) suggestedAction = "longpress";
            else if (isScrollable && !isClickable) suggestedAction = "scroll";
            else if (isClickable) suggestedAction = "tap";
            else suggestedAction = "read";
            elements.push({
              id: resourceId,
              text: text || desc,
              type: typeName,
              bounds,
              center: [centerX, centerY],
              size: [width, height],
              clickable: isClickable,
              editable: isEditable,
              enabled: isEnabled,
              checked: isChecked,
              focused: isFocused,
              selected: isSelected,
              scrollable: isScrollable,
              longClickable: isLongClickable,
              password: isPassword,
              hint: hint,
              action: suggestedAction,
              parent: parentLabel,
              depth,
            });
          }
        } catch {
          // Skip malformed bounds
        }
      }
      // Recurse with updated parent label
      walkChildren(node, nodeLabel, depth + 1);
      return;
    }
    // No bounds on this node — just recurse
    walkChildren(node, parentLabel, depth);
  }
  // Visits child <node> entries (singular or array) plus a possible
  // top-level <hierarchy> wrapper.
  function walkChildren(node: any, parentLabel: string, depth: number): void {
    if (node.node) {
      const children = Array.isArray(node.node) ? node.node : [node.node];
      for (const child of children) {
        walk(child, parentLabel, depth);
      }
    }
    if (node.hierarchy) {
      walk(node.hierarchy, parentLabel, depth);
    }
  }
  walk(parsed, "root", 0);
  return elements;
}
// ===========================================
// Smart Element Filtering (Phase 2A)
// ===========================================
/**
 * Compact representation sent to the LLM — only essential fields.
 * Non-default flags are included conditionally to minimize tokens;
 * an absent flag means its default (enabled=true, checked/focused/
 * editable/scrollable=false, hint="").
 */
export interface CompactUIElement {
  text: string; // label shown to the LLM
  center: [number, number]; // tap coordinates
  action: UIElement["action"]; // suggested interaction
  // Only included when non-default
  enabled?: false;
  checked?: true;
  focused?: true;
  hint?: string;
  editable?: true;
  scrollable?: true;
}
/**
 * Reduces a full UIElement to the token-efficient form sent to the LLM.
 * State flags are emitted only when they differ from their defaults
 * (enabled defaults to true; the rest default to false / empty).
 */
export function compactElement(el: UIElement): CompactUIElement {
  const result: CompactUIElement = {
    text: el.text,
    center: el.center,
    action: el.action,
  };
  if (!el.enabled) result.enabled = false;
  if (el.checked) result.checked = true;
  if (el.focused) result.focused = true;
  if (el.hint) result.hint = el.hint;
  if (el.editable) result.editable = true;
  if (el.scrollable) result.scrollable = true;
  return result;
}
/**
 * Relevance score used to rank elements before sending them to the LLM.
 * Enabled (+10), editable (+8), focused (+6), tappable/long-pressable (+5),
 * and labeled (+3) elements rank higher.
 */
function scoreElement(el: UIElement): number {
  const contributions: Array<[boolean, number]> = [
    [el.enabled, 10],
    [el.editable, 8],
    [el.focused, 6],
    [el.clickable || el.longClickable, 5],
    [Boolean(el.text), 3],
  ];
  let total = 0;
  for (const [applies, points] of contributions) {
    if (applies) total += points;
  }
  return total;
}
/**
 * Deduplicates elements whose centers land in the same 5px grid bucket
 * (keeping the highest-scoring one per bucket), ranks the survivors by
 * relevance score, and returns the top `limit` in compact form.
 */
export function filterElements(
  elements: UIElement[],
  limit: number
): CompactUIElement[] {
  const GRID = 5;
  // Bucket by rounded center; keep only the best-scoring element per bucket.
  const best = new Map<string, UIElement>();
  for (const candidate of elements) {
    const bx = Math.round(candidate.center[0] / GRID) * GRID;
    const by = Math.round(candidate.center[1] / GRID) * GRID;
    const key = `${bx},${by}`;
    const current = best.get(key);
    if (current === undefined || scoreElement(candidate) > scoreElement(current)) {
      best.set(key, candidate);
    }
  }
  // Highest score first, then cap and compact.
  return [...best.values()]
    .sort((a, b) => scoreElement(b) - scoreElement(a))
    .slice(0, limit)
    .map(compactElement);
}