initial commit

This commit is contained in:
Sanju Sivalingam
2026-02-06 08:59:43 +05:30
commit 477d99861c
12 changed files with 1386 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

BIN
android-action-kernel/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,63 @@
# Android Action Kernel Configuration (TypeScript/Bun)
# Copy this file to .env and fill in your settings
# cp .env.example .env
# ===========================================
# Agent Configuration
# ===========================================
MAX_STEPS=30 # Maximum steps before stopping (30 for complex multi-app tasks)
STEP_DELAY=2 # Seconds to wait between steps
MAX_RETRIES=3 # Retries on ADB/network failures
STUCK_THRESHOLD=3 # Steps before stuck-loop recovery kicks in
# ===========================================
# Vision Fallback (when accessibility tree is empty)
# ===========================================
VISION_ENABLED=true # Auto-capture screenshot when UI elements not found
# ===========================================
# LLM Provider: "groq", "openai", "bedrock", or "openrouter"
# ===========================================
LLM_PROVIDER=groq
# ===========================================
# Groq Configuration (Free tier available)
# Get your key at: https://console.groq.com
# ===========================================
GROQ_API_KEY=gsk_your_key_here
GROQ_MODEL=llama-3.3-70b-versatile
# Other models: llama-3.1-8b-instant (faster, higher rate limits)
# ===========================================
# OpenAI Configuration
# Get your key at: https://platform.openai.com
# ===========================================
OPENAI_API_KEY=sk-your_key_here
OPENAI_MODEL=gpt-4o
# Other models: gpt-4o-mini (faster, cheaper)
# ===========================================
# AWS Bedrock Configuration
# Uses AWS credential chain (run 'aws configure' first)
# ===========================================
AWS_REGION=us-east-1
BEDROCK_MODEL=us.meta.llama3-3-70b-instruct-v1:0
# Other models:
# anthropic.claude-3-sonnet-20240229-v1:0
# anthropic.claude-3-haiku-20240307-v1:0
# meta.llama3-8b-instruct-v1:0
# ===========================================
# OpenRouter Configuration (via Vercel AI SDK)
# Access 200+ models through a single API
# Get your key at: https://openrouter.ai/keys
# ===========================================
OPENROUTER_API_KEY=sk-or-v1-your_key_here
OPENROUTER_MODEL=anthropic/claude-3.5-sonnet
# Popular models:
# anthropic/claude-3.5-sonnet (best reasoning)
# openai/gpt-4o (multimodal)
# google/gemini-2.0-flash-001 (fast + cheap)
# meta-llama/llama-3.3-70b-instruct (open source)
# mistralai/mistral-large-latest (European)
# deepseek/deepseek-chat (cost efficient)

4
android-action-kernel/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
node_modules/
dist/
bun.lock
.env

View File

@@ -0,0 +1,22 @@
{
"name": "android-action-kernel",
"version": "1.0.0",
"description": "AI agent that controls Android devices through the accessibility API - TypeScript/Bun edition",
"type": "module",
"scripts": {
"start": "bun run src/kernel.ts",
"build": "bun build src/kernel.ts --outdir dist --target bun",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"@aws-sdk/client-bedrock-runtime": "^3.700.0",
"@openrouter/ai-sdk-provider": "^2.1.1",
"ai": "^6.0.72",
"fast-xml-parser": "^4.5.0",
"openai": "^4.73.0"
},
"devDependencies": {
"@types/bun": "^1.1.0",
"typescript": "^5.6.0"
}
}

View File

@@ -0,0 +1,322 @@
/**
* Action execution module for Android Action Kernel.
* Handles all ADB commands for interacting with Android devices.
*
* Supported actions:
* tap, type, enter, swipe, home, back, wait, done,
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
*/
import { Config } from "./config.js";
import {
KEYCODE_ENTER,
KEYCODE_HOME,
KEYCODE_BACK,
KEYCODE_DEL,
KEYCODE_MOVE_HOME,
KEYCODE_MOVE_END,
SWIPE_COORDS,
SWIPE_DURATION_MS,
LONG_PRESS_DURATION_MS,
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
} from "./constants.js";
/**
 * A single action chosen by the LLM, as consumed by `executeAction`.
 * Only `action` is required; the remaining fields are read by the specific
 * handler that `action` selects.
 */
export interface ActionDecision {
// Action name that executeAction dispatches on (e.g. "tap", "type", "launch").
action: string;
// Screen position [x, y]; read by the tap and longpress handlers.
coordinates?: [number, number];
// Text payload; read by the type and clipboard_set handlers.
text?: string;
// Swipe direction; used as a key into SWIPE_COORDS.
direction?: string;
// Free-form explanation from the LLM; only logged / shown in history.
reason?: string;
// launch action
package?: string;
activity?: string;
uri?: string;
extras?: Record<string, string>;
// shell action
command?: string;
// screenshot action
filename?: string;
}
/**
 * Outcome of executing one action, returned by every action handler.
 */
export interface ActionResult {
// false when the handler rejected the action (e.g. missing text or unknown action name).
success: boolean;
// Human-readable summary of what was done or why it failed.
message: string;
// Optional payload from the handler: command output, clipboard text, or a screenshot path.
data?: string;
}
/**
 * Runs `adb <command...>` synchronously and returns its trimmed stdout.
 *
 * A run counts as failed only when stderr is non-empty and contains the word
 * "error" (case-insensitive). Failed runs are retried with exponential backoff
 * (1s, 2s, 4s, ...). Once the retries are used up the failure is logged and the
 * stdout of the last run is returned anyway; this function does not throw on
 * ADB errors.
 *
 * @param command Arguments passed to the adb binary (without the binary itself).
 * @param retries Number of additional attempts after the first one.
 * @returns Trimmed stdout of the last run, or "" when no attempt was made.
 */
export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): string {
  const totalAttempts = retries + 1;
  let attempt = 0;

  while (attempt < totalAttempts) {
    const proc = Bun.spawnSync([Config.ADB_PATH, ...command], {
      stdout: "pipe",
      stderr: "pipe",
    });
    const output = proc.stdout.toString().trim();
    const errorText = proc.stderr.toString().trim();
    const failed = errorText !== "" && errorText.toLowerCase().includes("error");

    if (!failed) {
      return output;
    }

    if (attempt >= retries) {
      // Out of attempts: report it, but still hand back whatever stdout we got.
      console.log(`ADB Error (all retries exhausted): ${errorText}`);
      return output;
    }

    const delayMs = 2 ** attempt * 1000;
    console.log(`ADB Error (attempt ${attempt + 1}/${totalAttempts}): ${errorText}`);
    console.log(`Retrying in ${delayMs / 1000}s...`);
    Bun.sleepSync(delayMs);
    attempt += 1;
  }

  return "";
}
/** Lookup table from action name to the handler that performs it. */
const ACTION_HANDLERS = new Map<string, (action: ActionDecision) => ActionResult>([
  ["tap", executeTap],
  ["type", executeType],
  ["enter", executeEnter],
  ["swipe", executeSwipe],
  ["home", executeHome],
  ["back", executeBack],
  ["wait", executeWait],
  ["done", executeDone],
  ["longpress", executeLongPress],
  ["screenshot", executeScreenshot],
  ["launch", executeLaunch],
  ["clear", executeClear],
  ["clipboard_get", executeClipboardGet],
  ["clipboard_set", executeClipboardSet],
  ["shell", executeShell],
]);

/**
 * Runs the action chosen by the LLM and reports the outcome to the kernel.
 *
 * @param action Decision whose `action` field selects the handler.
 * @returns The handler's result, or a failed result for an unrecognised action name.
 */
export function executeAction(action: ActionDecision): ActionResult {
  const handler = ACTION_HANDLERS.get(action.action);
  if (handler === undefined) {
    console.log(`Warning: Unknown action: ${action.action}`);
    return { success: false, message: `Unknown action: ${action.action}` };
  }
  return handler(action);
}
// ===========================================
// Original actions (enhanced)
// ===========================================
/**
 * Taps the screen at `action.coordinates`.
 *
 * @param action Decision carrying the `[x, y]` position to tap.
 * @returns A failed result, without touching the device, when the coordinates
 *          are missing or are not two finite numbers; otherwise a success
 *          result describing the tap.
 */
function executeTap(action: ActionDecision): ActionResult {
  const coords = action.coordinates;
  // Never fall back to (0, 0): that would tap the screen corner and report a bogus success.
  if (!coords || !Number.isFinite(coords[0]) || !Number.isFinite(coords[1])) {
    return { success: false, message: "No valid coordinates to tap" };
  }
  const [x, y] = coords;
  console.log(`Tapping: (${x}, ${y})`);
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);
  return { success: true, message: `Tapped (${x}, ${y})` };
}
/**
 * Builds the single argument handed to `adb shell input text`.
 *
 * adb joins its arguments and lets the device shell re-parse them, so the text
 * is wrapped in single quotes (with embedded single quotes closed, escaped and
 * reopened). This keeps every shell metacharacter literal. Spaces are encoded
 * as `%s`, which the original code also relied on `input text` to turn back
 * into spaces.
 */
function toInputTextArg(text: string): string {
  const quoted = text.replace(/'/g, "'\\''").replace(/ /g, "%s");
  return `'${quoted}'`;
}

/**
 * Types `action.text` into the currently focused field.
 *
 * @param action Decision carrying the text to type.
 * @returns A failed result when there is no text; otherwise a success result.
 */
function executeType(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to type" };
  console.log(`Typing: ${text}`);
  runAdbCommand(["shell", "input", "text", toInputTextArg(text)]);
  return { success: true, message: `Typed "${text}"` };
}
/** Sends the Enter key event to the device. */
function executeEnter(): ActionResult {
  const message = "Pressed Enter";
  console.log("Pressing Enter");
  const keyEventArgs = ["shell", "input", "keyevent", KEYCODE_ENTER];
  runAdbCommand(keyEventArgs);
  return { success: true, message };
}
/**
 * Swipes in `action.direction` using the fixed coordinates from SWIPE_COORDS.
 * A missing direction defaults to "up"; a direction with no SWIPE_COORDS entry
 * also uses the "up" coordinates, while the log and result message still name
 * the requested direction.
 */
function executeSwipe(action: ActionDecision): ActionResult {
  const direction = action.direction ?? "up";
  const path = SWIPE_COORDS[direction] ?? SWIPE_COORDS["up"];
  const points = [0, 1, 2, 3].map((index) => String(path[index]));
  console.log(`Swiping ${direction}`);
  runAdbCommand(["shell", "input", "swipe", ...points, SWIPE_DURATION_MS]);
  return { success: true, message: `Swiped ${direction}` };
}
/** Sends the Home key event to the device. */
function executeHome(): ActionResult {
  const message = "Went to home screen";
  console.log("Going Home");
  const keyEventArgs = ["shell", "input", "keyevent", KEYCODE_HOME];
  runAdbCommand(keyEventArgs);
  return { success: true, message };
}
/** Sends the Back key event to the device. */
function executeBack(): ActionResult {
  const message = "Went back";
  console.log("Going Back");
  const keyEventArgs = ["shell", "input", "keyevent", KEYCODE_BACK];
  runAdbCommand(keyEventArgs);
  return { success: true, message };
}
/** Blocks for two seconds without sending anything to the device. */
function executeWait(): ActionResult {
  const waitMs = 2000;
  console.log("Waiting...");
  Bun.sleepSync(waitMs);
  return { success: true, message: "Waited 2s" };
}
/**
 * Logs that the goal was reached, using the LLM's reason when provided.
 * Does not touch the device; always succeeds with the message "done".
 */
function executeDone(action: ActionDecision): ActionResult {
  const reason = action.reason ?? "Task complete";
  console.log(`Goal Achieved: ${reason}`);
  const outcome: ActionResult = { success: true, message: "done" };
  return outcome;
}
// ===========================================
// New actions
// ===========================================
/**
 * Long press at coordinates (opens context menus, triggers drag mode, etc.)
 *
 * @param action Decision carrying the `[x, y]` position to press.
 * @returns A failed result, without touching the device, when the coordinates
 *          are missing or are not two finite numbers; otherwise a success result.
 */
function executeLongPress(action: ActionDecision): ActionResult {
  const coords = action.coordinates;
  // Never fall back to (0, 0): that would press the screen corner and report a bogus success.
  if (!coords || !Number.isFinite(coords[0]) || !Number.isFinite(coords[1])) {
    return { success: false, message: "No valid coordinates to long press" };
  }
  const [x, y] = coords;
  console.log(`Long pressing: (${x}, ${y})`);
  // A swipe from the same point to the same point with long duration = long press
  runAdbCommand([
    "shell", "input", "swipe",
    String(x), String(y), String(x), String(y),
    LONG_PRESS_DURATION_MS,
  ]);
  return { success: true, message: `Long pressed (${x}, ${y})` };
}
/**
 * Takes a screenshot on the device and pulls it to the local machine.
 * The local destination is `action.filename`, or LOCAL_SCREENSHOT_PATH when
 * none is given. Always reports success; the local path is returned in `data`.
 */
function executeScreenshot(action: ActionDecision): ActionResult {
  const localPath = action.filename ?? LOCAL_SCREENSHOT_PATH;
  console.log(`Taking screenshot → ${localPath}`);

  const captureArgs = ["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH];
  const pullArgs = ["pull", DEVICE_SCREENSHOT_PATH, localPath];
  runAdbCommand(captureArgs);
  runAdbCommand(pullArgs);

  return {
    success: true,
    message: `Screenshot saved to ${localPath}`,
    data: localPath,
  };
}
/**
 * Launches an app by package name, activity, or URI intent.
 *
 * Examples the LLM can produce:
 * { action: "launch", package: "com.whatsapp" }
 * { action: "launch", package: "com.whatsapp", activity: ".HomeActivity" }
 * { action: "launch", uri: "https://maps.google.com/?q=pizza+near+me" }
 * { action: "launch", package: "com.whatsapp", uri: "content://media/external/images/1",
 * extras: { "android.intent.extra.TEXT": "Check this out" } }
 *
 * Behaviour by input:
 * - package only: started through `monkey` with the LAUNCHER category
 *   (extras are not applied on this path).
 * - package + activity: `am start -n package/activity`.
 * - uri: `am start -a android.intent.action.VIEW -d uri`; when a package is
 *   also given without an activity, the intent is restricted to it with `-p`
 *   instead of dropping the uri and extras.
 * - neither package nor uri: nothing is run and a failed result is returned.
 *
 * Every LLM-supplied value is single-quoted, because adb hands the joined
 * arguments to the device shell, where an unquoted `&` or `$` would be
 * interpreted.
 */
function executeLaunch(action: ActionDecision): ActionResult {
  const { package: pkg, activity, uri, extras } = action;
  const shellQuote = (value: string): string => `'${value.replace(/'/g, "'\\''")}'`;

  if (!pkg && !uri) {
    return { success: false, message: "No package or uri to launch" };
  }

  if (pkg && !activity && !uri) {
    // Launch the default activity for the package
    const launchResult = runAdbCommand([
      "shell", "monkey", "-p", shellQuote(pkg), "-c",
      "android.intent.category.LAUNCHER", "1",
    ]);
    console.log(`Launching: ${pkg}`);
    return { success: true, message: `Launched ${pkg}`, data: launchResult };
  }

  const args: string[] = ["shell", "am", "start"];
  if (uri) {
    args.push("-a", "android.intent.action.VIEW");
    args.push("-d", shellQuote(uri));
  }
  if (pkg && activity) {
    args.push("-n", shellQuote(`${pkg}/${activity}`));
  } else if (pkg) {
    // A uri is present here: keep it and limit resolution to the requested package.
    args.push("-p", shellQuote(pkg));
  }
  // Attach intent extras
  if (extras) {
    for (const [key, value] of Object.entries(extras)) {
      args.push("--es", shellQuote(key), shellQuote(String(value)));
    }
  }

  const label = pkg ?? uri ?? "intent";
  console.log(`Launching: ${label}`);
  const result = runAdbCommand(args);
  return { success: true, message: `Launched ${label}`, data: result };
}
/**
 * Clears the currently focused text field.
 *
 * Moves the cursor to the end of the field, then sends a batch of DEL key
 * events in a single `input keyevent` call. The previous implementation sent
 * `--longpress MOVE_HOME`, which does not select text, so only one character
 * was ever deleted.
 *
 * Limitation: at most 100 characters before the cursor are removed per call;
 * run the action again for longer content.
 */
function executeClear(): ActionResult {
  const maxDeletes = 100;
  console.log("Clearing text field");
  // Move to end of field
  runAdbCommand(["shell", "input", "keyevent", KEYCODE_MOVE_END]);
  // `input keyevent` accepts several key codes at once, so one adb round trip
  // covers the whole batch of backspaces.
  const deleteKeys = new Array<string>(maxDeletes).fill(KEYCODE_DEL);
  runAdbCommand(["shell", "input", "keyevent", ...deleteKeys]);
  return { success: true, message: "Cleared text field" };
}
/**
 * Reads the current clipboard contents.
 *
 * First asks `cmd clipboard get-text`; any non-empty output is returned as the
 * clipboard text. Otherwise falls back to a raw `service call clipboard`
 * request and returns its output unparsed. Success is reported in both cases,
 * even when the fallback output is empty.
 */
function executeClipboardGet(): ActionResult {
  console.log("Reading clipboard");

  const clipboardText = runAdbCommand(["shell", "cmd", "clipboard", "get-text"]);
  if (!clipboardText) {
    // Fallback for older Android versions
    // NOTE(review): the output is the raw service-call reply, not decoded text.
    const rawReply = runAdbCommand([
      "shell", "service", "call", "clipboard", "2", "i32", "1",
    ]);
    return { success: true, message: `Clipboard (raw): ${rawReply}`, data: rawReply };
  }

  console.log(`Clipboard: ${clipboardText.slice(0, 100)}`);
  return { success: true, message: `Clipboard: ${clipboardText}`, data: clipboardText };
}
/**
 * Sets the clipboard to the given text.
 *
 * The text is single-quoted before it is handed to adb: adb joins its
 * arguments and the device shell re-parses them, so unquoted text with spaces
 * or shell metacharacters would be split or interpreted.
 *
 * @param action Decision carrying the text to place on the clipboard.
 * @returns A failed result when there is no text; otherwise a success result.
 */
function executeClipboardSet(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to set on clipboard" };
  console.log(`Setting clipboard: ${text.slice(0, 50)}...`);
  const quotedText = `'${text.replace(/'/g, "'\\''")}'`;
  runAdbCommand(["shell", "cmd", "clipboard", "set-text", quotedText]);
  return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` };
}
/**
 * Runs an arbitrary ADB shell command. Use sparingly for edge cases.
 *
 * The command string is split on single spaces and appended to `adb shell`.
 * NOTE(review): the command comes straight from the LLM and is not restricted
 * in any way.
 */
function executeShell(action: ActionDecision): ActionResult {
  const commandLine = action.command ?? "";
  if (commandLine === "") {
    return { success: false, message: "No command provided" };
  }

  console.log(`Shell: ${commandLine}`);
  const shellArgs = ["shell"].concat(commandLine.split(" "));
  const output = runAdbCommand(shellArgs);
  const preview = output.slice(0, 200);
  return { success: true, message: `Shell output: ${preview}`, data: output };
}

View File

@@ -0,0 +1,82 @@
/**
* Configuration management for Android Action Kernel.
* Bun natively loads .env files — no dotenv needed.
*/
import {
DEVICE_DUMP_PATH,
LOCAL_DUMP_PATH,
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
DEFAULT_MAX_STEPS,
DEFAULT_STEP_DELAY,
DEFAULT_GROQ_MODEL,
DEFAULT_OPENAI_MODEL,
DEFAULT_BEDROCK_MODEL,
DEFAULT_MAX_RETRIES,
DEFAULT_STUCK_THRESHOLD,
DEFAULT_VISION_ENABLED,
} from "./constants.js";
/**
 * Reads an environment variable.
 *
 * @param key Name of the variable.
 * @param fallback Value returned when the variable is not set (default "").
 * @returns The variable's value; an empty-string value is returned as-is, not replaced.
 */
function env(key: string, fallback = ""): string {
  const value = process.env[key];
  if (value == null) {
    return fallback;
  }
  return value;
}
/** Reads an integer setting; uses `fallback` when the variable is unset or not a number. */
function envInt(key: string, fallback: number): number {
  const parsed = parseInt(env(key, String(fallback)), 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/** Reads a decimal setting; uses `fallback` when the variable is unset or not a number. */
function envFloat(key: string, fallback: number): number {
  const parsed = parseFloat(env(key, String(fallback)));
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Runtime configuration, resolved once from environment variables when this
 * module is loaded.
 *
 * Numeric settings fall back to their defaults when the variable holds a
 * non-numeric value. Without that, a NaN MAX_RETRIES would make the ADB retry
 * loop run zero attempts and a NaN MAX_STEPS would make the agent loop run
 * zero steps.
 */
export const Config = {
  // ADB Configuration
  ADB_PATH: env("ADB_PATH", "adb"),
  SCREEN_DUMP_PATH: DEVICE_DUMP_PATH,
  LOCAL_DUMP_PATH: LOCAL_DUMP_PATH,
  DEVICE_SCREENSHOT_PATH: DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH: LOCAL_SCREENSHOT_PATH,
  // Agent Configuration
  MAX_STEPS: envInt("MAX_STEPS", DEFAULT_MAX_STEPS),
  STEP_DELAY: envFloat("STEP_DELAY", DEFAULT_STEP_DELAY),
  MAX_RETRIES: envInt("MAX_RETRIES", DEFAULT_MAX_RETRIES),
  STUCK_THRESHOLD: envInt("STUCK_THRESHOLD", DEFAULT_STUCK_THRESHOLD),
  // Vision fallback (when accessibility tree is empty).
  // Compared case-insensitively and ignoring surrounding whitespace.
  VISION_ENABLED:
    env("VISION_ENABLED", String(DEFAULT_VISION_ENABLED)).trim().toLowerCase() === "true",
  // LLM Provider: "groq", "openai", "bedrock", or "openrouter"
  LLM_PROVIDER: env("LLM_PROVIDER", "groq"),
  // Groq Configuration
  GROQ_API_KEY: env("GROQ_API_KEY"),
  GROQ_MODEL: env("GROQ_MODEL", DEFAULT_GROQ_MODEL),
  // OpenAI Configuration
  OPENAI_API_KEY: env("OPENAI_API_KEY"),
  OPENAI_MODEL: env("OPENAI_MODEL", DEFAULT_OPENAI_MODEL),
  // AWS Bedrock Configuration
  AWS_REGION: env("AWS_REGION", "us-east-1"),
  BEDROCK_MODEL: env("BEDROCK_MODEL", DEFAULT_BEDROCK_MODEL),
  // OpenRouter Configuration (via Vercel AI SDK)
  OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
  OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),

  /**
   * Returns the model name for the configured provider.
   * Any provider other than groq, bedrock or openrouter gets the OpenAI model.
   */
  getModel(): string {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq") return Config.GROQ_MODEL;
    if (provider === "bedrock") return Config.BEDROCK_MODEL;
    if (provider === "openrouter") return Config.OPENROUTER_MODEL;
    return Config.OPENAI_MODEL;
  },

  /**
   * Checks that the API key for the configured provider is present.
   * Provider names it does not recognise pass without any check.
   *
   * @throws Error when the groq, openai or openrouter key is missing.
   */
  validate(): void {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq" && !Config.GROQ_API_KEY) {
      throw new Error("GROQ_API_KEY is required when using Groq provider");
    }
    if (provider === "openai" && !Config.OPENAI_API_KEY) {
      throw new Error("OPENAI_API_KEY is required when using OpenAI provider");
    }
    if (provider === "openrouter" && !Config.OPENROUTER_API_KEY) {
      throw new Error("OPENROUTER_API_KEY is required when using OpenRouter provider");
    }
    // Bedrock uses AWS credential chain, no explicit validation needed
  },
};

View File

@@ -0,0 +1,78 @@
/**
* Constants for Android Action Kernel.
* All magic strings, URLs, and fixed values in one place.
*/
// ===========================================
// API Endpoints
// ===========================================
// Base URL the OpenAI client is pointed at when the provider is "groq".
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";
// ===========================================
// ADB Key Codes
// ===========================================
// Values passed as the key argument of `adb shell input keyevent`.
// They are strings because they are placed directly into the adb argument list.
// HOME and BACK use the symbolic key name; the others use the numeric code.
export const KEYCODE_ENTER = "66";
export const KEYCODE_HOME = "KEYCODE_HOME";
export const KEYCODE_BACK = "KEYCODE_BACK";
export const KEYCODE_DEL = "67";
export const KEYCODE_FORWARD_DEL = "112";
export const KEYCODE_MOVE_HOME = "122";
export const KEYCODE_MOVE_END = "123";
export const KEYCODE_MENU = "82";
export const KEYCODE_TAB = "61";
export const KEYCODE_ESCAPE = "111";
export const KEYCODE_DPAD_UP = "19";
export const KEYCODE_DPAD_DOWN = "20";
export const KEYCODE_DPAD_LEFT = "21";
export const KEYCODE_DPAD_RIGHT = "22";
export const KEYCODE_VOLUME_UP = "24";
export const KEYCODE_VOLUME_DOWN = "25";
export const KEYCODE_POWER = "26";
// ===========================================
// Default Screen Coordinates (for swipe actions)
// Adjust based on target device resolution
// ===========================================
// Fixed pixel positions; they are not derived from the connected device.
export const SCREEN_CENTER_X = 540;
export const SCREEN_CENTER_Y = 1200;
// Swipe coordinates: [start_x, start_y, end_x, end_y]
// Keyed by direction name. "up" drags from y=1500 to y=500, "down" is the reverse;
// "left" drags from x=800 to x=200, "right" is the reverse.
export const SWIPE_COORDS: Record<string, [number, number, number, number]> = {
up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500],
down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500],
left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y],
right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y],
};
// Durations in milliseconds, kept as strings because they are passed as the
// last argument of `adb shell input swipe`.
export const SWIPE_DURATION_MS = "300";
export const LONG_PRESS_DURATION_MS = "1000";
// ===========================================
// Default Models
// ===========================================
// Used by the configuration module when the matching *_MODEL variable is not set.
export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile";
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
// NOTE(review): the configuration module does not import this constant; it
// repeats the same literal for OPENROUTER_MODEL. Keep the two in sync.
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";
// ===========================================
// Bedrock Model Identifiers
// ===========================================
// Imported by the LLM provider module.
// NOTE(review): presumably substrings matched against the Bedrock model id to
// pick a request format — confirm against the Bedrock provider.
export const BEDROCK_ANTHROPIC_MODELS = ["anthropic"];
export const BEDROCK_META_MODELS = ["meta", "llama"];
// ===========================================
// File Paths
// ===========================================
// DEVICE_* paths are locations on the Android device; LOCAL_* paths are where
// `adb pull` writes the copy on the host, relative to the working directory.
export const DEVICE_DUMP_PATH = "/sdcard/window_dump.xml";
export const LOCAL_DUMP_PATH = "window_dump.xml";
export const DEVICE_SCREENSHOT_PATH = "/sdcard/kernel_screenshot.png";
export const LOCAL_SCREENSHOT_PATH = "kernel_screenshot.png";
// ===========================================
// Agent Defaults
// ===========================================
// Fallbacks for MAX_STEPS, STEP_DELAY (seconds), MAX_RETRIES, STUCK_THRESHOLD
// and VISION_ENABLED when the environment does not set them.
export const DEFAULT_MAX_STEPS = 30;
export const DEFAULT_STEP_DELAY = 2.0;
export const DEFAULT_MAX_RETRIES = 3;
export const DEFAULT_STUCK_THRESHOLD = 3;
export const DEFAULT_VISION_ENABLED = true;

View File

@@ -0,0 +1,298 @@
/**
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
*
* An AI agent that controls Android devices through the accessibility API.
* Uses LLMs to make decisions based on screen context.
*
* Features:
* - Perception → Reasoning → Action loop
* - Screen state diffing (stuck loop detection)
* - Error recovery with retries
* - Vision fallback when accessibility tree is empty
* - Dynamic early exit on goal completion
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
*
* Usage:
* bun run src/kernel.ts
*/
import { existsSync, readFileSync, rmSync } from "fs";
import { Config } from "./config.js";
import {
  executeAction,
  runAdbCommand,
  type ActionDecision,
  type ActionResult,
} from "./actions.js";
import { getLlmProvider, type LLMProvider } from "./llm-providers.js";
import {
  getInteractiveElements,
  computeScreenHash,
  type UIElement,
} from "./sanitizer.js";
import {
  DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH,
} from "./constants.js";
// ===========================================
// Screen Perception
// ===========================================
/**
 * Dumps the current UI XML and returns parsed elements + JSON string.
 *
 * The local dump left by a previous step is deleted before capturing, so a
 * failed dump or pull cannot be mistaken for a fresh capture of the current
 * screen. Any failure while capturing, reading or parsing yields the error
 * state (no elements and an error string) instead of throwing.
 *
 * @returns The interactive elements and their pretty-printed JSON, or the
 *          error state when the screen could not be captured.
 */
function getScreenState(): { elements: UIElement[]; json: string } {
  const captureFailed = (): { elements: UIElement[]; json: string } => ({
    elements: [],
    json: "Error: Could not capture screen.",
  });

  try {
    rmSync(Config.LOCAL_DUMP_PATH, { force: true });
    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);

    if (!existsSync(Config.LOCAL_DUMP_PATH)) {
      return captureFailed();
    }
    const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
    const elements = getInteractiveElements(xmlContent);
    return { elements, json: JSON.stringify(elements, null, 2) };
  } catch {
    console.log("Warning: ADB screen capture failed.");
    return captureFailed();
  }
}
/**
 * Captures a screenshot and returns the local file path.
 * Used as a vision fallback when the accessibility tree is empty.
 *
 * The screenshot left by a previous capture is deleted first, so the existence
 * check below only passes when this capture actually produced a file.
 *
 * @returns LOCAL_SCREENSHOT_PATH on success, or null when no file was produced.
 */
function captureScreenshot(): string | null {
  try {
    rmSync(LOCAL_SCREENSHOT_PATH, { force: true });
    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);
    if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      return LOCAL_SCREENSHOT_PATH;
    }
  } catch {
    console.log("Warning: Screenshot capture failed.");
  }
  return null;
}
// ===========================================
// Screen State Diffing
// ===========================================
/** Result of comparing two consecutive screen captures. */
interface ScreenDiff {
// true when the hashes of the two element lists differ.
changed: boolean;
// Element texts present now but not in the previous capture.
addedTexts: string[];
// Element texts present in the previous capture but not now.
removedTexts: string[];
// One-line description of the change, appended to the LLM context.
summary: string;
}
/**
 * Compares two screen captures.
 *
 * "Changed" is decided by comparing `computeScreenHash` of both element lists.
 * The added/removed text lists are computed from the non-empty `text` values
 * of the elements; the summary lists at most five of each.
 *
 * @param prevElements Elements captured before the last action.
 * @param currElements Elements captured after the last action.
 */
function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const collectTexts = (elements: UIElement[]) =>
    new Set(elements.map((element) => element.text).filter(Boolean));

  const before = collectTexts(prevElements);
  const after = collectTexts(currElements);
  const addedTexts = [...after].filter((text) => !before.has(text));
  const removedTexts = [...before].filter((text) => !after.has(text));

  const changed = computeScreenHash(prevElements) !== computeScreenHash(currElements);
  if (!changed) {
    return {
      changed,
      addedTexts,
      removedTexts,
      summary: "Screen has NOT changed since last action.",
    };
  }

  const notes: string[] = [];
  if (addedTexts.length > 0) {
    notes.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
  }
  if (removedTexts.length > 0) {
    notes.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
  }
  const summary = notes.join(". ") || "Screen layout changed.";
  return { changed, addedTexts, removedTexts, summary };
}
// ===========================================
// Action History Formatting
// ===========================================
/**
 * Renders the action history as a PREVIOUS_ACTIONS section for the LLM context.
 *
 * Each step is one line: `Step N: <what> - <reason> [<outcome>]`, where the
 * outcome is "OK" or "FAILED", or empty when no result exists for that step.
 *
 * @param actionHistory Decisions in the order they were taken.
 * @param resultHistory Results at the same indexes as `actionHistory`.
 * @returns "" for an empty history, otherwise the section with its leading blank lines.
 */
function formatActionHistory(
  actionHistory: ActionDecision[],
  resultHistory: ActionResult[]
): string {
  if (actionHistory.length === 0) return "";

  const describe = (entry: ActionDecision): string => {
    const actionType = entry.action ?? "unknown";
    switch (actionType) {
      case "type":
        return `typed "${entry.text ?? ""}"`;
      case "tap":
        return `tapped ${JSON.stringify(entry.coordinates ?? [])}`;
      case "launch":
        return `launched ${entry.package ?? entry.uri ?? ""}`;
      case "screenshot":
        return "took screenshot";
      default:
        return actionType;
    }
  };

  const lines: string[] = [];
  actionHistory.forEach((entry, index) => {
    const result = resultHistory[index];
    let outcome = "";
    if (result) {
      outcome = result.success ? "OK" : "FAILED";
    }
    const reason = entry.reason ?? "N/A";
    lines.push(`Step ${index + 1}: ${describe(entry)} - ${reason} [${outcome}]`);
  });

  return "\n\nPREVIOUS_ACTIONS:\n" + lines.join("\n");
}
// ===========================================
// Main Agent Loop
// ===========================================
/**
 * Runs the perception → reasoning → action loop until the LLM answers "done"
 * or the step budget is used up.
 *
 * Each step: capture the screen, compare it with the previous capture (adding
 * a recovery hint once the screen has stayed unchanged for
 * `Config.STUCK_THRESHOLD` steps), take a screenshot when no elements were
 * found and vision is enabled, ask the LLM for a decision, execute it, and
 * record both the decision and its result.
 *
 * LLM failures degrade to a "wait" action and action failures are recorded as
 * failed results, so a single bad step does not end the run.
 *
 * @param goal The user's task, passed to the LLM on every step.
 * @param maxSteps Step budget; defaults to `Config.MAX_STEPS`.
 */
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
  const steps = maxSteps ?? Config.MAX_STEPS;
  console.log("Android Action Kernel Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision fallback: ${Config.VISION_ENABLED ? "ON" : "OFF"}`);
  const llm = getLlmProvider();
  const actionHistory: ActionDecision[] = [];
  const resultHistory: ActionResult[] = [];
  let prevElements: UIElement[] = [];
  let stuckCount = 0;

  for (let step = 0; step < steps; step++) {
    console.log(`\n--- Step ${step + 1}/${steps} ---`);

    // 1. Perception: Capture screen state
    console.log("Scanning screen...");
    const { elements, json: screenContext } = getScreenState();

    // 2. Screen diff: detect stuck loops
    let diffContext = "";
    if (step > 0) {
      const diff = diffScreenState(prevElements, elements);
      diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;
      if (!diff.changed) {
        stuckCount++;
        console.log(
          `Warning: Screen unchanged for ${stuckCount} step(s).`
        );
        if (stuckCount >= Config.STUCK_THRESHOLD) {
          console.log(
            `Stuck for ${stuckCount} steps. Injecting recovery hint.`
          );
          diffContext +=
            `\nWARNING: You have been stuck for ${stuckCount} steps. ` +
            `The screen is NOT changing. Try a DIFFERENT action: ` +
            `swipe to scroll, press back, go home, or launch a different app.`;
        }
      } else {
        stuckCount = 0;
      }
    }
    prevElements = elements;

    // 3. Vision fallback: if accessibility tree is empty, use screenshot
    let visionContext = "";
    if (elements.length === 0 && Config.VISION_ENABLED) {
      console.log("Accessibility tree empty. Attempting vision fallback...");
      const screenshotPath = captureScreenshot();
      if (screenshotPath) {
        // Only this text hint reaches the LLM; the image itself is not attached here.
        visionContext =
          "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
          "A screenshot has been captured. The screen likely contains custom-drawn " +
          "content (game, WebView, or Flutter). Try using coordinate-based taps on " +
          "common UI positions, or use 'back'/'home' to navigate away. " +
          "If you know the app package name, use 'launch' to restart it.";
        console.log("Vision fallback: screenshot captured for context.");
      }
    }

    // 4. Reasoning: Get LLM decision
    console.log("Thinking...");
    const historyStr = formatActionHistory(actionHistory, resultHistory);
    const fullContext = screenContext + historyStr + diffContext + visionContext;
    let decision: ActionDecision;
    try {
      decision = await llm.getDecision(goal, fullContext, actionHistory);
    } catch (err) {
      // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
      const message = err instanceof Error ? err.message : String(err);
      console.log(`LLM Error: ${message}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting for retry" };
    }
    console.log(`Decision: ${decision.action} - ${decision.reason ?? "no reason"}`);

    // 5. Action: Execute the decision
    let result: ActionResult;
    try {
      result = executeAction(decision);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      console.log(`Action Error: ${message}`);
      result = { success: false, message };
    }

    // Track history
    actionHistory.push(decision);
    resultHistory.push(result);

    // 6. Check for goal completion
    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      return;
    }

    // Wait for UI to update
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }
  console.log("\nMax steps reached. Task may be incomplete.");
}
// ===========================================
// Entry Point
// ===========================================
/**
 * Entry point: validates the configuration, prompts for a goal on stdin, and
 * runs the agent with it.
 *
 * Only the first chunk delivered by stdin is used as the goal. A failed read
 * is reported and ends the run instead of leaving a promise that never
 * settles; the reader lock is released in every case.
 */
async function main(): Promise<void> {
  try {
    Config.validate();
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    console.log(`Configuration Error: ${message}`);
    return;
  }

  // Read user input from stdin
  process.stdout.write("Enter your goal: ");
  const reader = Bun.stdin.stream().getReader();
  let goal = "";
  try {
    const { value } = await reader.read();
    // `value` is undefined at end of input; decode() then yields "".
    goal = new TextDecoder().decode(value).trim();
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    console.log(`Failed to read goal: ${message}`);
    return;
  } finally {
    reader.releaseLock();
  }

  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }
  await runAgent(goal);
}
main();

View File

@@ -0,0 +1,327 @@
/**
* LLM Provider module for Android Action Kernel.
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
*/
import OpenAI from "openai";
import {
BedrockRuntimeClient,
InvokeModelCommand,
} from "@aws-sdk/client-bedrock-runtime";
import { generateText } from "ai";
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
import { Config } from "./config.js";
import {
GROQ_API_BASE_URL,
BEDROCK_ANTHROPIC_MODELS,
BEDROCK_META_MODELS,
} from "./constants.js";
import type { ActionDecision } from "./actions.js";
// ===========================================
// System Prompt — all 15 actions + rich element context
// ===========================================
/**
 * System prompt sent as the `system` message of every LLM request.
 * It lists the JSON shape of each action the executor understands, the element
 * properties present in SCREEN_CONTEXT, and the behavioural rules for the agent.
 * The action names here must stay in sync with the action executor's dispatch.
 */
const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
You will receive:
1. GOAL — the user's task.
2. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates, states, and hierarchy.
3. PREVIOUS_ACTIONS — your action history with outcomes (OK/FAILED).
4. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
5. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
You must output ONLY a valid JSON object with your next action.
═══════════════════════════════════════════
AVAILABLE ACTIONS (15 total)
═══════════════════════════════════════════
Navigation:
{"action": "tap", "coordinates": [x, y], "reason": "..."}
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
{"action": "enter", "reason": "Press Enter/submit"}
{"action": "back", "reason": "Navigate back"}
{"action": "home", "reason": "Go to home screen"}
Text Input:
{"action": "type", "text": "Hello World", "reason": "..."}
{"action": "clear", "reason": "Clear current text field before typing"}
App Control:
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
Data:
{"action": "screenshot", "reason": "Capture current screen"}
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
{"action": "clipboard_get", "reason": "Read clipboard contents"}
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
System:
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
{"action": "wait", "reason": "Wait for screen to load"}
{"action": "done", "reason": "Task is complete"}
═══════════════════════════════════════════
ELEMENT PROPERTIES YOU WILL SEE
═══════════════════════════════════════════
Each element in SCREEN_CONTEXT has:
- text: visible label or content description
- center: [x, y] coordinates to tap
- size: [width, height] in pixels
- enabled: whether the element can be interacted with (DO NOT tap disabled elements!)
- checked: checkbox/toggle state (true = ON)
- focused: whether this field currently has input focus
- selected: whether this item is currently selected (tabs, list items)
- scrollable: whether this container can be scrolled
- longClickable: supports long-press for context menu
- editable: text input field
- password: password input (don't read/log the text)
- hint: placeholder text shown when field is empty
- parent: the containing element (helps understand layout hierarchy)
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
═══════════════════════════════════════════
CRITICAL RULES
═══════════════════════════════════════════
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
3. ALREADY TYPED: Check PREVIOUS_ACTIONS. Do NOT re-type text you already entered.
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
11. PASSWORDS: Never log or output the text of password fields.
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
13. SEARCH: After typing in a search field, use "enter" to submit the search.
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
// ===========================================
// Provider Interface
// ===========================================
/**
 * One previously executed agent step, as accepted by LLMProvider.getDecision.
 * Every field is optional.
 * NOTE(review): the per-field meanings below are inferred from the field names
 * and the action vocabulary in the system prompt — confirm against the kernel.
 */
interface ActionHistoryEntry {
// Name of the action that was taken (presumably "tap", "type", "launch", ...).
action?: string;
// Free-text justification recorded with the action.
reason?: string;
// Text payload of the action (presumably what was typed) — TODO confirm.
text?: string;
// A pair of numbers; presumably the [x, y] screen position that was targeted.
coordinates?: [number, number];
// Presumably the Android package name used by a "launch" action — TODO confirm.
package?: string;
// Presumably the intent URI used by a "launch" action — TODO confirm.
uri?: string;
}
/**
 * Contract implemented by every LLM backend in this file
 * (OpenAIProvider, OpenRouterProvider, BedrockProvider).
 */
export interface LLMProvider {
/**
 * Asks the model for the next action to take.
 *
 * @param goal - The task text; providers embed it in the prompt after "GOAL:".
 * @param screenContext - Description of the current screen; providers embed it
 *   after "SCREEN_CONTEXT:".
 * @param actionHistory - Previously taken steps. The providers in this file
 *   accept but do not read this argument.
 * @returns A promise resolving to the decision parsed from the model's reply.
 */
getDecision(
goal: string,
screenContext: string,
actionHistory: ActionHistoryEntry[]
): Promise<ActionDecision>;
}
// ===========================================
// OpenAI / Groq Provider
// ===========================================
/**
 * Provider for OpenAI-compatible chat-completion endpoints.
 *
 * When `Config.LLM_PROVIDER` is "groq" the OpenAI client is pointed at
 * `GROQ_API_BASE_URL` with the Groq key and model; otherwise it talks to
 * OpenAI with the OpenAI key and model.
 */
class OpenAIProvider implements LLMProvider {
  private readonly client: OpenAI;
  private readonly model: string;

  constructor() {
    if (Config.LLM_PROVIDER === "groq") {
      this.client = new OpenAI({
        apiKey: Config.GROQ_API_KEY,
        baseURL: GROQ_API_BASE_URL,
      });
      this.model = Config.GROQ_MODEL;
    } else {
      this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
      this.model = Config.OPENAI_MODEL;
    }
  }

  /**
   * Requests the next action from the chat-completions API in JSON mode.
   *
   * @param goal - Task text, embedded after "GOAL:".
   * @param screenContext - Screen description, embedded after "SCREEN_CONTEXT:".
   * @param _actionHistory - Unused here; history arrives inside `screenContext`.
   * @returns The parsed decision. An absent completion yields `{}` (as before);
   *   content that is not valid JSON yields a "wait" decision instead of
   *   throwing, matching the fallback the other providers get from the shared
   *   response parser.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    // screenContext now includes history, diff, and vision context from kernel
    const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`;

    const response = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: [
        { role: "system", content: SYSTEM_PROMPT },
        { role: "user", content: userContent },
      ],
    });

    // `choices` can be empty and `content` can be null (e.g. a refusal or a
    // filtered reply), so neither is dereferenced unguarded.
    const content = response.choices[0]?.message?.content ?? "{}";

    try {
      return JSON.parse(content);
    } catch {
      // JSON mode does not protect against truncated output; degrade to a
      // harmless "wait" step rather than crashing the agent loop.
      console.log(
        `Warning: Could not parse LLM response: ${content.slice(0, 200)}`
      );
      return { action: "wait", reason: "Failed to parse response, waiting" };
    }
  }
}
// ===========================================
// OpenRouter Provider (Vercel AI SDK)
// ===========================================
/**
 * Provider that routes requests through OpenRouter using `generateText`.
 * The API key and model name are read from `Config` at construction time.
 */
class OpenRouterProvider implements LLMProvider {
  private openrouter: ReturnType<typeof createOpenRouter>;
  private model: string;

  constructor() {
    const apiKey = Config.OPENROUTER_API_KEY;
    this.openrouter = createOpenRouter({ apiKey });
    this.model = Config.OPENROUTER_MODEL;
  }

  /**
   * Sends the goal and screen context as a single prompt and parses the reply.
   *
   * @param goal - Task text, embedded after "GOAL:".
   * @param screenContext - Screen description, embedded after "SCREEN_CONTEXT:".
   * @param _actionHistory - Accepted for interface compatibility; not read.
   * @returns The decision produced by `parseJsonResponse` from the reply text.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    // No JSON mode is requested here, so the instruction is appended to the prompt.
    const prompt = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}\n\nRespond with ONLY a valid JSON object.`;

    const { text } = await generateText({
      model: this.openrouter.chat(this.model),
      system: SYSTEM_PROMPT,
      prompt,
    });

    return parseJsonResponse(text);
  }
}
// ===========================================
// AWS Bedrock Provider
// ===========================================
/**
 * Provider that calls a model through the AWS Bedrock runtime `InvokeModel` API.
 *
 * Request and response shapes differ per model family, so the configured model
 * id is matched against `BEDROCK_ANTHROPIC_MODELS` and `BEDROCK_META_MODELS`
 * to pick the payload format; any other model gets the
 * `inputText` / `textGenerationConfig` format.
 */
class BedrockProvider implements LLMProvider {
  private readonly client: BedrockRuntimeClient;
  private readonly model: string;

  constructor() {
    this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
    this.model = Config.BEDROCK_MODEL;
  }

  /**
   * Invokes the configured model and parses its reply into a decision.
   *
   * @param goal - Task text, embedded after "GOAL:".
   * @param screenContext - Screen description, embedded after "SCREEN_CONTEXT:".
   * @param _actionHistory - Accepted for interface compatibility; not read.
   * @returns The decision produced by `parseJsonResponse` from the model text.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`;
    const requestBody = this.buildRequest(userContent);

    const command = new InvokeModelCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
      accept: "application/json",
    });

    const response = await this.client.send(command);
    const responseBody = JSON.parse(new TextDecoder().decode(response.body));
    const resultText = this.extractResponse(responseBody);
    return parseJsonResponse(resultText);
  }

  /** True when the model id contains one of the known Anthropic identifiers. */
  private isAnthropicModel(): boolean {
    return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id));
  }

  /** True when the lower-cased model id contains one of the known Meta identifiers. */
  private isMetaModel(): boolean {
    return BEDROCK_META_MODELS.some((id) =>
      this.model.toLowerCase().includes(id)
    );
  }

  /**
   * Builds the JSON request body for the configured model family.
   *
   * @param userContent - The already-formatted goal and screen context.
   * @returns The serialized request payload.
   */
  private buildRequest(userContent: string): string {
    if (this.isAnthropicModel()) {
      return JSON.stringify({
        anthropic_version: "bedrock-2023-05-31",
        max_tokens: 1024,
        system: SYSTEM_PROMPT,
        messages: [
          {
            role: "user",
            content:
              userContent + "\n\nRespond with ONLY a valid JSON object.",
          },
        ],
      });
    }

    if (this.isMetaModel()) {
      // Meta models take one raw prompt string, so the system and user turns
      // are framed by hand with the header / end-of-turn markers.
      return JSON.stringify({
        prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${userContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
        max_gen_len: 512,
        temperature: 0.1,
      });
    }

    // Fallback payload for every other model id.
    // NOTE(review): this looks like the Amazon Titan text schema — confirm.
    return JSON.stringify({
      inputText: `${SYSTEM_PROMPT}\n\n${userContent}\n\nRespond with ONLY a valid JSON object.`,
      textGenerationConfig: {
        maxTokenCount: 512,
        temperature: 0.1,
      },
    });
  }

  /**
   * Pulls the generated text out of a decoded response body.
   *
   * Every branch returns "" when the expected field is missing, instead of
   * throwing a TypeError, so the caller's parser can apply its own fallback
   * for an unusable reply.
   *
   * @param responseBody - The decoded JSON response from `InvokeModel`.
   * @returns The model's text output, or "" if it cannot be located.
   */
  private extractResponse(responseBody: Record<string, any>): string {
    if (this.isAnthropicModel()) {
      return responseBody.content?.[0]?.text ?? "";
    }
    if (this.isMetaModel()) {
      return responseBody.generation ?? "";
    }
    return responseBody.results?.[0]?.outputText ?? "";
  }
}
// ===========================================
// Shared JSON Parsing
// ===========================================
/**
 * Finds the index of the `}` that closes the `{` located at `start`.
 *
 * Braces inside JSON string literals (including after escaped quotes) are
 * ignored so that values such as `"a } b"` do not end the object early.
 *
 * @param text - The text being scanned.
 * @param start - Index of an opening `{` in `text`.
 * @returns The index of the matching `}`, or -1 if the braces never balance.
 */
function findJsonObjectEnd(text: string, start: number): number {
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];

    if (inString) {
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }

    if (ch === '"') {
      inString = true;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return i;
    }
  }

  return -1;
}

/**
 * Parses an LLM reply into an action decision, tolerating surrounding noise.
 *
 * The whole text is tried as JSON first. If that fails, or yields something
 * that is not a plain object, the text is scanned for brace-balanced
 * `{ ... }` spans (so nested objects survive markdown fences or prose) and
 * the first span that parses to an object is returned.
 *
 * @param text - Raw text returned by the model.
 * @returns The parsed decision, or a `"wait"` decision when nothing usable is
 *   found (a warning with the first 200 characters is logged in that case).
 */
function parseJsonResponse(text: string): ActionDecision {
  const tryParseObject = (candidate: string): ActionDecision | undefined => {
    try {
      const parsed: unknown = JSON.parse(candidate);
      // `null`, arrays and primitives are valid JSON but are not decisions.
      const isPlainObject =
        typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
      return isPlainObject ? (parsed as ActionDecision) : undefined;
    } catch {
      return undefined;
    }
  };

  const direct = tryParseObject(text);
  if (direct !== undefined) return direct;

  // Try every opening brace as a possible start; a stray "{" in prose either
  // fails to balance or fails to parse and is skipped.
  for (
    let start = text.indexOf("{");
    start !== -1;
    start = text.indexOf("{", start + 1)
  ) {
    const end = findJsonObjectEnd(text, start);
    if (end === -1) continue;

    const extracted = tryParseObject(text.slice(start, end + 1));
    if (extracted !== undefined) return extracted;
  }

  console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
  return { action: "wait", reason: "Failed to parse response, waiting" };
}
// ===========================================
// Factory
// ===========================================
/**
 * Creates the provider selected by `Config.LLM_PROVIDER`.
 *
 * @returns A `BedrockProvider` for "bedrock", an `OpenRouterProvider` for
 *   "openrouter", and an `OpenAIProvider` for every other value.
 */
export function getLlmProvider(): LLMProvider {
  switch (Config.LLM_PROVIDER) {
    case "bedrock":
      return new BedrockProvider();
    case "openrouter":
      return new OpenRouterProvider();
    default:
      // OpenAIProvider decides between Groq and OpenAI on its own.
      return new OpenAIProvider();
  }
}

View File

@@ -0,0 +1,171 @@
/**
* XML Sanitizer for Android Action Kernel.
* Parses Android Accessibility XML and extracts interactive UI elements
* with full state information and parent-child hierarchy context.
*/
import { XMLParser } from "fast-xml-parser";
/**
 * One on-screen element extracted from the accessibility XML by
 * getInteractiveElements.
 */
export interface UIElement {
// Value of the node's "resource-id" attribute ("" when absent).
id: string;
// The node's "text" attribute, or its "content-desc" when text is empty.
text: string;
// Last dot-separated segment of the node's "class" attribute.
type: string;
// Raw "bounds" attribute exactly as it appears in the XML.
bounds: string;
// Midpoint of the bounds, [x, y], rounded down.
center: [number, number];
// [width, height] computed from the bounds.
size: [number, number];
// True when the "clickable" attribute is "true".
clickable: boolean;
// True when the class name contains "EditText" or "AutoCompleteTextView",
// or the "editable" attribute is "true".
editable: boolean;
// False only when the "enabled" attribute is "false"; otherwise true.
enabled: boolean;
// True when the "checked" attribute is "true".
checked: boolean;
// True when the "focused" attribute is "true".
focused: boolean;
// True when the "selected" attribute is "true".
selected: boolean;
// True when the "scrollable" attribute is "true".
scrollable: boolean;
// True when the "long-clickable" attribute is "true".
longClickable: boolean;
// True when the "password" attribute is "true".
password: boolean;
// Value of the node's "hint" attribute ("" when absent).
hint: string;
// Suggested interaction, derived from the editable / long-clickable /
// scrollable / clickable flags; "read" when none of them apply.
action: "tap" | "type" | "longpress" | "scroll" | "read";
// Label of the nearest ancestor that has bounds ("root" when there is none).
parent: string;
// Number of ancestors with bounds above this element (0 at the top).
depth: number;
}
/**
 * Builds a deterministic fingerprint string for a screen so that two
 * snapshots can be compared for equality.
 *
 * Each element contributes `id|text|centerX,centerY|enabled|checked`, and the
 * per-element parts are joined with ";". The result is a plain concatenation,
 * not a fixed-length digest.
 *
 * @param elements - Elements of the current screen, in order.
 * @returns The fingerprint; "" for an empty list.
 */
export function computeScreenHash(elements: UIElement[]): string {
  const fingerprints: string[] = [];

  for (const element of elements) {
    const position = `${element.center[0]},${element.center[1]}`;
    const state = `${element.enabled}|${element.checked}`;
    fingerprints.push(`${element.id}|${element.text}|${position}|${state}`);
  }

  return fingerprints.join(";");
}
/** Loosely-typed view of an object node produced by the XML parser. */
type XmlNode = Record<string, unknown>;

/** Narrows an arbitrary parsed value to an object node. */
function isXmlNode(value: unknown): value is XmlNode {
  return typeof value === "object" && value !== null;
}

/**
 * Reads the attribute `name` from a parsed node.
 *
 * Returns "" when the attribute is missing or is not a string (a value-less
 * attribute is parsed as boolean `true` because `allowBooleanAttributes` is
 * on), so callers can safely use string methods on the result.
 */
function readStringAttr(node: XmlNode, name: string): string {
  const value = node[`@_${name}`];
  return typeof value === "string" ? value : "";
}

/**
 * Parses a bounds attribute of the form "[x1,y1][x2,y2]".
 *
 * @returns `[x1, y1, x2, y2]`, or `null` when the text does not match.
 */
function parseBounds(bounds: string): [number, number, number, number] | null {
  const match = /^\s*\[(-?\d+),(-?\d+)\]\[(-?\d+),(-?\d+)\]\s*$/.exec(bounds);
  if (!match) return null;
  return [
    Number(match[1]),
    Number(match[2]),
    Number(match[3]),
    Number(match[4]),
  ];
}

/**
 * Parses Android Accessibility XML and returns a rich list of interactive elements.
 * Preserves state (enabled, checked, focused) and hierarchy context.
 *
 * A node is reported when it is clickable, long-clickable, scrollable or
 * editable, or when it carries text / a content description. Nodes whose
 * bounds are malformed or have zero (or negative) area are left out, but
 * their children are still visited.
 *
 * @param xmlContent - The accessibility XML dump to parse.
 * @returns The extracted elements in document order; `[]` if the XML cannot
 *   be parsed (a warning is logged in that case).
 */
export function getInteractiveElements(xmlContent: string): UIElement[] {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    allowBooleanAttributes: true,
  });

  let parsed: unknown;
  try {
    parsed = parser.parse(xmlContent);
  } catch {
    console.log("Warning: Error parsing XML. The screen might be loading.");
    return [];
  }

  const elements: UIElement[] = [];

  // Visits the `node` children (one object or an array of them) and the
  // `hierarchy` wrapper found at the top of the parsed document.
  function walkChildren(
    node: XmlNode,
    parentLabel: string,
    depth: number
  ): void {
    const childNodes = node.node;
    if (childNodes) {
      const children: unknown[] = Array.isArray(childNodes)
        ? childNodes
        : [childNodes];
      for (const child of children) {
        walk(child, parentLabel, depth);
      }
    }
    if (node.hierarchy) {
      walk(node.hierarchy, parentLabel, depth);
    }
  }

  function walk(node: unknown, parentLabel: string, depth: number): void {
    if (!isXmlNode(node)) return;

    const rawBounds = node["@_bounds"];
    if (!rawBounds) {
      // No bounds on this node — just recurse
      walkChildren(node, parentLabel, depth);
      return;
    }

    const flag = (name: string): boolean =>
      readStringAttr(node, name) === "true";

    const isClickable = flag("clickable");
    const isLongClickable = flag("long-clickable");
    const isScrollable = flag("scrollable");
    const isEnabled = readStringAttr(node, "enabled") !== "false"; // default true

    const elementClass = readStringAttr(node, "class");
    const isEditable =
      elementClass.includes("EditText") ||
      elementClass.includes("AutoCompleteTextView") ||
      flag("editable");

    const text = readStringAttr(node, "text");
    const desc = readStringAttr(node, "content-desc");
    const resourceId = readStringAttr(node, "resource-id");

    // Build a label for this node to use as parent context for children
    const typeName = elementClass.split(".").pop() ?? "";
    const nodeLabel = text || desc || resourceId.split("/").pop() || typeName;

    // Determine if this element should be included
    const isInteractive =
      isClickable || isEditable || isLongClickable || isScrollable;
    const hasContent = !!(text || desc);

    if (isInteractive || hasContent) {
      // Malformed bounds yield `null` here and the element is skipped, rather
      // than being emitted with NaN coordinates.
      const rect = typeof rawBounds === "string" ? parseBounds(rawBounds) : null;

      if (rect !== null && typeof rawBounds === "string") {
        const [x1, y1, x2, y2] = rect;
        const width = x2 - x1;
        const height = y2 - y1;

        // Skip zero-size elements (invisible); children are still walked below.
        if (width > 0 && height > 0) {
          let suggestedAction: UIElement["action"];
          if (isEditable) suggestedAction = "type";
          else if (isLongClickable && !isClickable) suggestedAction = "longpress";
          else if (isScrollable && !isClickable) suggestedAction = "scroll";
          else if (isClickable) suggestedAction = "tap";
          else suggestedAction = "read";

          elements.push({
            id: resourceId,
            text: text || desc,
            type: typeName,
            bounds: rawBounds,
            center: [Math.floor((x1 + x2) / 2), Math.floor((y1 + y2) / 2)],
            size: [width, height],
            clickable: isClickable,
            editable: isEditable,
            enabled: isEnabled,
            checked: flag("checked"),
            focused: flag("focused"),
            selected: flag("selected"),
            scrollable: isScrollable,
            longClickable: isLongClickable,
            password: flag("password"),
            hint: readStringAttr(node, "hint"),
            action: suggestedAction,
            parent: parentLabel,
            depth,
          });
        }
      }
    }

    // Recurse with updated parent label
    walkChildren(node, nodeLabel, depth + 1);
  }

  walk(parsed, "root", 0);
  return elements;
}

View File

@@ -0,0 +1,19 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ES2022",
"moduleResolution": "bundler",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"outDir": "dist",
"rootDir": "src",
"types": ["bun-types"],
"resolveJsonModule": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true
},
"include": ["src/**/*.ts"],
"exclude": ["node_modules", "dist"]
}