From 477d99861ca24a6525698d5aa23220f023fa3ac3 Mon Sep 17 00:00:00 2001 From: Sanju Sivalingam Date: Fri, 6 Feb 2026 08:59:43 +0530 Subject: [PATCH] initial commit --- .DS_Store | Bin 0 -> 6148 bytes android-action-kernel/.DS_Store | Bin 0 -> 6148 bytes android-action-kernel/.env.example | 63 ++++ android-action-kernel/.gitignore | 4 + android-action-kernel/package.json | 22 ++ android-action-kernel/src/actions.ts | 322 ++++++++++++++++++++ android-action-kernel/src/config.ts | 82 ++++++ android-action-kernel/src/constants.ts | 78 +++++ android-action-kernel/src/kernel.ts | 298 +++++++++++++++++++ android-action-kernel/src/llm-providers.ts | 327 +++++++++++++++++++++ android-action-kernel/src/sanitizer.ts | 171 +++++++++++ android-action-kernel/tsconfig.json | 19 ++ 12 files changed, 1386 insertions(+) create mode 100644 .DS_Store create mode 100644 android-action-kernel/.DS_Store create mode 100644 android-action-kernel/.env.example create mode 100644 android-action-kernel/.gitignore create mode 100644 android-action-kernel/package.json create mode 100644 android-action-kernel/src/actions.ts create mode 100644 android-action-kernel/src/config.ts create mode 100644 android-action-kernel/src/constants.ts create mode 100644 android-action-kernel/src/kernel.ts create mode 100644 android-action-kernel/src/llm-providers.ts create mode 100644 android-action-kernel/src/sanitizer.ts create mode 100644 android-action-kernel/tsconfig.json diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fddfd185c7c5c68d7ab71e605dd01797c8576d54 GIT binary patch literal 6148 zcmeHKPfyf96n_J&c0r`TE+WySaW5u-f`4L+F{}$lR}T$Q&GesKFRG;I(K#gwL@icn^8k&dgnF*KZ?i4Kaq~k%(3Gq48!O) zmhDyEv|}gUh)PblC|Yxz2X3M}?z~T%RWmomS?u_XhBCmfQFK{)hc3 zd~Z|FmwZg-`^7Yq8E3n?PxhSZHT(Jp`f`2we17os(3#=kvsH7zb-cBDC}uLn10I%w z4O;O%&TRh$t;FXRyoNXM7T&`L_zYj+JIRnia-NKktK=4$B=<;}JSHAlB|dqk z;yaSh;GE8f`1OEmCVt=r(2rO!B 
zDU@3WR`LpfD1IZPpe}w4M2w}esIjFGdr*i9MO2|gUonUZN4ur{iW*xARX7kiGUm}E z6MaJ=a&)v?(j7=qVOpmQqzoKnpi?zXasJ==^Zox|P})ivNE!IA7$BKxds68!x<#uC=5StKU zVG)?^#p*XE{GuGOjPA)If*8XKxrbZ~R@OcPMggP1zpH?Fb~7X)aV6yb!RHrV{E#Lu zUdDKxxTRy>d95&rlgY_Xkm<=DJAT5ltUhbCy3RY*#7)|9*^QP(Yk~97OS$VU2DDSR z^D~?!Zop_H6M_cCkd=o4Yw%8)w^<{WxmD$`a=BdH9@*KscKParGqpS6D0cnQ)b4K1 zDikl?cu?O6HyMAaga&*YN$b~`!Yd?lC4s-CR{|f?m3*=7bS=5wzLWi@2JFG1;lcc1 zp->n(J$hzr>}=g0_FR9X8HsD0;USO8VVl+hpR;Q4oYoThED@w`4;_koKpT(uyB4-v z7?a=ckYnA6C}qpdC}_;Bt;IB{{lb^(_CV}MJPhY~93%l}ODi$TTS@3NKS3ZjUtydX?c4!bUhl*u`*8*z9}NbXtDTq+|`!r2dr6~{-zWx zmgZA(K@po26~R3spE8Y5-1GM&UtpZkT&`HUuU!@yC%4EwQYGu8Ltc|NXUYFeo< zR-e(rC}0#Y3LH{Eyf_^!0!tdZ3gtT$D|rM!6rT~&kc-cMk>hAAY3wRQ4;rXcM3oBq zhyf}c`yCxu(%4m~(h10!aU4A}=o1Ra*|Fb|;RH$wO>Gn~3LI3RM>TD+|KI-e`Tt;& zX&D8K0{@i)ke#j0iXV1*Z#}X&vDZ3So?sCs;<^fD3RZeMmPBmDqgbS&&k+Q%q_L|I Rd(g~>fRw>hMuESoz)$=v&hr2O literal 0 HcmV?d00001 diff --git a/android-action-kernel/.env.example b/android-action-kernel/.env.example new file mode 100644 index 0000000..19990e2 --- /dev/null +++ b/android-action-kernel/.env.example @@ -0,0 +1,63 @@ +# Android Action Kernel Configuration (TypeScript/Bun) +# Copy this file to .env and fill in your settings +# cp .env.example .env + +# =========================================== +# Agent Configuration +# =========================================== +MAX_STEPS=30 # Maximum steps before stopping (30 for complex multi-app tasks) +STEP_DELAY=2 # Seconds to wait between steps +MAX_RETRIES=3 # Retries on ADB/network failures +STUCK_THRESHOLD=3 # Steps before stuck-loop recovery kicks in + +# =========================================== +# Vision Fallback (when accessibility tree is empty) +# =========================================== +VISION_ENABLED=true # Auto-capture screenshot when UI elements not found + +# =========================================== +# LLM Provider: "groq", "openai", "bedrock", or "openrouter" +# =========================================== +LLM_PROVIDER=groq + +# 
=========================================== +# Groq Configuration (Free tier available) +# Get your key at: https://console.groq.com +# =========================================== +GROQ_API_KEY=gsk_your_key_here +GROQ_MODEL=llama-3.3-70b-versatile +# Other models: llama-3.1-8b-instant (faster, higher rate limits) + +# =========================================== +# OpenAI Configuration +# Get your key at: https://platform.openai.com +# =========================================== +OPENAI_API_KEY=sk-your_key_here +OPENAI_MODEL=gpt-4o +# Other models: gpt-4o-mini (faster, cheaper) + +# =========================================== +# AWS Bedrock Configuration +# Uses AWS credential chain (run 'aws configure' first) +# =========================================== +AWS_REGION=us-east-1 +BEDROCK_MODEL=us.meta.llama3-3-70b-instruct-v1:0 +# Other models: +# anthropic.claude-3-sonnet-20240229-v1:0 +# anthropic.claude-3-haiku-20240307-v1:0 +# meta.llama3-8b-instruct-v1:0 + +# =========================================== +# OpenRouter Configuration (via Vercel AI SDK) +# Access 200+ models through a single API +# Get your key at: https://openrouter.ai/keys +# =========================================== +OPENROUTER_API_KEY=sk-or-v1-your_key_here +OPENROUTER_MODEL=anthropic/claude-3.5-sonnet +# Popular models: +# anthropic/claude-3.5-sonnet (best reasoning) +# openai/gpt-4o (multimodal) +# google/gemini-2.0-flash-001 (fast + cheap) +# meta-llama/llama-3.3-70b-instruct (open source) +# mistralai/mistral-large-latest (European) +# deepseek/deepseek-chat (cost efficient) diff --git a/android-action-kernel/.gitignore b/android-action-kernel/.gitignore new file mode 100644 index 0000000..fb16f59 --- /dev/null +++ b/android-action-kernel/.gitignore @@ -0,0 +1,4 @@ +node_modules/ +dist/ +bun.lock +.env diff --git a/android-action-kernel/package.json b/android-action-kernel/package.json new file mode 100644 index 0000000..b0820fe --- /dev/null +++ b/android-action-kernel/package.json @@ 
-0,0 +1,22 @@ +{ + "name": "android-action-kernel", + "version": "1.0.0", + "description": "AI agent that controls Android devices through the accessibility API - TypeScript/Bun edition", + "type": "module", + "scripts": { + "start": "bun run src/kernel.ts", + "build": "bun build src/kernel.ts --outdir dist --target bun", + "typecheck": "tsc --noEmit" + }, + "dependencies": { + "@aws-sdk/client-bedrock-runtime": "^3.700.0", + "@openrouter/ai-sdk-provider": "^2.1.1", + "ai": "^6.0.72", + "fast-xml-parser": "^4.5.0", + "openai": "^4.73.0" + }, + "devDependencies": { + "@types/bun": "^1.1.0", + "typescript": "^5.6.0" + } +} diff --git a/android-action-kernel/src/actions.ts b/android-action-kernel/src/actions.ts new file mode 100644 index 0000000..14d1839 --- /dev/null +++ b/android-action-kernel/src/actions.ts @@ -0,0 +1,322 @@ +/** + * Action execution module for Android Action Kernel. + * Handles all ADB commands for interacting with Android devices. + * + * Supported actions: + * tap, type, enter, swipe, home, back, wait, done, + * longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell + */ + +import { Config } from "./config.js"; +import { + KEYCODE_ENTER, + KEYCODE_HOME, + KEYCODE_BACK, + KEYCODE_DEL, + KEYCODE_MOVE_HOME, + KEYCODE_MOVE_END, + SWIPE_COORDS, + SWIPE_DURATION_MS, + LONG_PRESS_DURATION_MS, + DEVICE_SCREENSHOT_PATH, + LOCAL_SCREENSHOT_PATH, +} from "./constants.js"; + +export interface ActionDecision { + action: string; + coordinates?: [number, number]; + text?: string; + direction?: string; + reason?: string; + // launch action + package?: string; + activity?: string; + uri?: string; + extras?: Record; + // shell action + command?: string; + // screenshot action + filename?: string; +} + +export interface ActionResult { + success: boolean; + message: string; + data?: string; +} + +/** + * Executes a shell command via ADB with retry support. 
+ */ +export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): string { + for (let attempt = 0; attempt <= retries; attempt++) { + const result = Bun.spawnSync([Config.ADB_PATH, ...command], { + stdout: "pipe", + stderr: "pipe", + }); + + const stdout = result.stdout.toString().trim(); + const stderr = result.stderr.toString().trim(); + + if (stderr && stderr.toLowerCase().includes("error")) { + if (attempt < retries) { + const delay = Math.pow(2, attempt) * 1000; + console.log(`ADB Error (attempt ${attempt + 1}/${retries + 1}): ${stderr}`); + console.log(`Retrying in ${delay / 1000}s...`); + Bun.sleepSync(delay); + continue; + } + console.log(`ADB Error (all retries exhausted): ${stderr}`); + } + + return stdout; + } + + return ""; +} + +/** + * Executes the action decided by the LLM. Returns a result for the kernel to track. + */ +export function executeAction(action: ActionDecision): ActionResult { + switch (action.action) { + case "tap": + return executeTap(action); + case "type": + return executeType(action); + case "enter": + return executeEnter(); + case "swipe": + return executeSwipe(action); + case "home": + return executeHome(); + case "back": + return executeBack(); + case "wait": + return executeWait(); + case "done": + return executeDone(action); + case "longpress": + return executeLongPress(action); + case "screenshot": + return executeScreenshot(action); + case "launch": + return executeLaunch(action); + case "clear": + return executeClear(); + case "clipboard_get": + return executeClipboardGet(); + case "clipboard_set": + return executeClipboardSet(action); + case "shell": + return executeShell(action); + default: + console.log(`Warning: Unknown action: ${action.action}`); + return { success: false, message: `Unknown action: ${action.action}` }; + } +} + +// =========================================== +// Original actions (enhanced) +// =========================================== + +function executeTap(action: ActionDecision): 
ActionResult { + const [x, y] = action.coordinates ?? [0, 0]; + console.log(`Tapping: (${x}, ${y})`); + runAdbCommand(["shell", "input", "tap", String(x), String(y)]); + return { success: true, message: `Tapped (${x}, ${y})` }; +} + +function executeType(action: ActionDecision): ActionResult { + const text = action.text ?? ""; + if (!text) return { success: false, message: "No text to type" }; + // ADB requires %s for spaces, escape special shell characters + const escapedText = text + .replaceAll("\\", "\\\\") + .replaceAll("\"", "\\\"") + .replaceAll("'", "\\'") + .replaceAll(" ", "%s") + .replaceAll("&", "\\&") + .replaceAll("|", "\\|") + .replaceAll(";", "\\;") + .replaceAll("(", "\\(") + .replaceAll(")", "\\)") + .replaceAll("<", "\\<") + .replaceAll(">", "\\>"); + console.log(`Typing: ${text}`); + runAdbCommand(["shell", "input", "text", escapedText]); + return { success: true, message: `Typed "${text}"` }; +} + +function executeEnter(): ActionResult { + console.log("Pressing Enter"); + runAdbCommand(["shell", "input", "keyevent", KEYCODE_ENTER]); + return { success: true, message: "Pressed Enter" }; +} + +function executeSwipe(action: ActionDecision): ActionResult { + const direction = action.direction ?? "up"; + const coords = SWIPE_COORDS[direction] ?? 
SWIPE_COORDS["up"]; + + console.log(`Swiping ${direction}`); + runAdbCommand([ + "shell", "input", "swipe", + String(coords[0]), String(coords[1]), + String(coords[2]), String(coords[3]), + SWIPE_DURATION_MS, + ]); + return { success: true, message: `Swiped ${direction}` }; +} + +function executeHome(): ActionResult { + console.log("Going Home"); + runAdbCommand(["shell", "input", "keyevent", KEYCODE_HOME]); + return { success: true, message: "Went to home screen" }; +} + +function executeBack(): ActionResult { + console.log("Going Back"); + runAdbCommand(["shell", "input", "keyevent", KEYCODE_BACK]); + return { success: true, message: "Went back" }; +} + +function executeWait(): ActionResult { + console.log("Waiting..."); + Bun.sleepSync(2000); + return { success: true, message: "Waited 2s" }; +} + +function executeDone(action: ActionDecision): ActionResult { + console.log(`Goal Achieved: ${action.reason ?? "Task complete"}`); + return { success: true, message: "done" }; +} + +// =========================================== +// New actions +// =========================================== + +/** + * Long press at coordinates (opens context menus, triggers drag mode, etc.) + */ +function executeLongPress(action: ActionDecision): ActionResult { + const [x, y] = action.coordinates ?? [0, 0]; + console.log(`Long pressing: (${x}, ${y})`); + // A swipe from the same point to the same point with long duration = long press + runAdbCommand([ + "shell", "input", "swipe", + String(x), String(y), String(x), String(y), + LONG_PRESS_DURATION_MS, + ]); + return { success: true, message: `Long pressed (${x}, ${y})` }; +} + +/** + * Captures a screenshot and saves it locally. + */ +function executeScreenshot(action: ActionDecision): ActionResult { + const filename = action.filename ?? 
LOCAL_SCREENSHOT_PATH; + console.log(`Taking screenshot → ${filename}`); + runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]); + runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, filename]); + return { success: true, message: `Screenshot saved to ${filename}`, data: filename }; +} + +/** + * Launches an app by package name, activity, or URI intent. + * + * Examples the LLM can produce: + * { action: "launch", package: "com.whatsapp" } + * { action: "launch", package: "com.whatsapp", activity: ".HomeActivity" } + * { action: "launch", uri: "https://maps.google.com/?q=pizza+near+me" } + * { action: "launch", package: "com.whatsapp", uri: "content://media/external/images/1", + * extras: { "android.intent.extra.TEXT": "Check this out" } } + */ +function executeLaunch(action: ActionDecision): ActionResult { + const args: string[] = ["shell", "am", "start"]; + + if (action.uri) { + args.push("-a", "android.intent.action.VIEW"); + args.push("-d", action.uri); + } + + if (action.package && action.activity) { + args.push("-n", `${action.package}/${action.activity}`); + } else if (action.package) { + // Package only: start its LAUNCHER activity via monkey and return early (NOTE: any uri/extras on the action are not applied on this path) + const launchResult = runAdbCommand([ + "shell", "monkey", "-p", action.package, "-c", + "android.intent.category.LAUNCHER", "1", + ]); + console.log(`Launching: ${action.package}`); + return { success: true, message: `Launched ${action.package}`, data: launchResult }; + } + + // Attach intent extras + if (action.extras) { + for (const [key, value] of Object.entries(action.extras)) { + args.push("--es", key, value); + } + } + + const label = action.package ?? action.uri ?? "intent"; + console.log(`Launching: ${label}`); + const result = runAdbCommand(args); + return { success: true, message: `Launched ${label}`, data: result }; +} + +/** + * Clears the currently focused text field. + * Selects all text then deletes it. 
+ */ +function executeClear(): ActionResult { + console.log("Clearing text field"); + // Move to end of field + runAdbCommand(["shell", "input", "keyevent", KEYCODE_MOVE_END]); + // Long-press MOVE_HOME, intended to select back to the start (no Shift modifier is sent — TODO confirm this actually selects text) + runAdbCommand(["shell", "input", "keyevent", "--longpress", KEYCODE_MOVE_HOME]); + // Delete selected text + runAdbCommand(["shell", "input", "keyevent", KEYCODE_DEL]); + return { success: true, message: "Cleared text field" }; +} + +/** + * Reads the current clipboard contents. + */ +function executeClipboardGet(): ActionResult { + console.log("Reading clipboard"); + // First try the clipboard shell command (`cmd clipboard get-text`). + // On Android 10+, direct clipboard access via ADB is restricted. + // An empty result falls through to the raw service call below. + const result = runAdbCommand(["shell", "cmd", "clipboard", "get-text"]); + if (result) { + console.log(`Clipboard: ${result.slice(0, 100)}`); + return { success: true, message: `Clipboard: ${result}`, data: result }; + } + // Fallback when get-text returned nothing (e.g. older Android versions): raw service call, output is returned unparsed + const fallback = runAdbCommand([ + "shell", "service", "call", "clipboard", "2", "i32", "1", + ]); + return { success: true, message: `Clipboard (raw): ${fallback}`, data: fallback }; +} + +/** + * Sets the clipboard to the given text. + */ +function executeClipboardSet(action: ActionDecision): ActionResult { + const text = action.text ?? ""; + if (!text) return { success: false, message: "No text to set on clipboard" }; + console.log(`Setting clipboard: ${text.slice(0, 50)}...`); + runAdbCommand(["shell", "cmd", "clipboard", "set-text", text]); + return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` }; +} + +/** + * Runs an arbitrary ADB shell command. Use sparingly for edge cases. + */ +function executeShell(action: ActionDecision): ActionResult { + const cmd = action.command ?? 
""; + if (!cmd) return { success: false, message: "No command provided" }; + console.log(`Shell: ${cmd}`); + const result = runAdbCommand(["shell", ...cmd.split(" ")]); + return { success: true, message: `Shell output: ${result.slice(0, 200)}`, data: result }; +} diff --git a/android-action-kernel/src/config.ts b/android-action-kernel/src/config.ts new file mode 100644 index 0000000..1f95493 --- /dev/null +++ b/android-action-kernel/src/config.ts @@ -0,0 +1,82 @@ +/** + * Configuration management for Android Action Kernel. + * Bun natively loads .env files — no dotenv needed. + */ + +import { + DEVICE_DUMP_PATH, + LOCAL_DUMP_PATH, + DEVICE_SCREENSHOT_PATH, + LOCAL_SCREENSHOT_PATH, + DEFAULT_MAX_STEPS, + DEFAULT_STEP_DELAY, + DEFAULT_GROQ_MODEL, + DEFAULT_OPENAI_MODEL, + DEFAULT_BEDROCK_MODEL, + DEFAULT_MAX_RETRIES, + DEFAULT_STUCK_THRESHOLD, + DEFAULT_VISION_ENABLED, +} from "./constants.js"; + +function env(key: string, fallback = ""): string { + return process.env[key] ?? fallback; +} + +export const Config = { + // ADB Configuration + ADB_PATH: env("ADB_PATH", "adb"), + SCREEN_DUMP_PATH: DEVICE_DUMP_PATH, + LOCAL_DUMP_PATH: LOCAL_DUMP_PATH, + DEVICE_SCREENSHOT_PATH: DEVICE_SCREENSHOT_PATH, + LOCAL_SCREENSHOT_PATH: LOCAL_SCREENSHOT_PATH, + + // Agent Configuration + MAX_STEPS: parseInt(env("MAX_STEPS", String(DEFAULT_MAX_STEPS)), 10), + STEP_DELAY: parseFloat(env("STEP_DELAY", String(DEFAULT_STEP_DELAY))), + MAX_RETRIES: parseInt(env("MAX_RETRIES", String(DEFAULT_MAX_RETRIES)), 10), + STUCK_THRESHOLD: parseInt(env("STUCK_THRESHOLD", String(DEFAULT_STUCK_THRESHOLD)), 10), + + // Vision fallback (when accessibility tree is empty) + VISION_ENABLED: env("VISION_ENABLED", String(DEFAULT_VISION_ENABLED)) === "true", + + // LLM Provider: "groq", "openai", "bedrock", or "openrouter" + LLM_PROVIDER: env("LLM_PROVIDER", "groq"), + + // Groq Configuration + GROQ_API_KEY: env("GROQ_API_KEY"), + GROQ_MODEL: env("GROQ_MODEL", DEFAULT_GROQ_MODEL), + + // OpenAI Configuration + 
OPENAI_API_KEY: env("OPENAI_API_KEY"), + OPENAI_MODEL: env("OPENAI_MODEL", DEFAULT_OPENAI_MODEL), + + // AWS Bedrock Configuration + AWS_REGION: env("AWS_REGION", "us-east-1"), + BEDROCK_MODEL: env("BEDROCK_MODEL", DEFAULT_BEDROCK_MODEL), + + // OpenRouter Configuration (via Vercel AI SDK) + OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"), + OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"), + + getModel(): string { + const provider = Config.LLM_PROVIDER; + if (provider === "groq") return Config.GROQ_MODEL; + if (provider === "bedrock") return Config.BEDROCK_MODEL; + if (provider === "openrouter") return Config.OPENROUTER_MODEL; + return Config.OPENAI_MODEL; + }, + + validate(): void { + const provider = Config.LLM_PROVIDER; + if (provider === "groq" && !Config.GROQ_API_KEY) { + throw new Error("GROQ_API_KEY is required when using Groq provider"); + } + if (provider === "openai" && !Config.OPENAI_API_KEY) { + throw new Error("OPENAI_API_KEY is required when using OpenAI provider"); + } + if (provider === "openrouter" && !Config.OPENROUTER_API_KEY) { + throw new Error("OPENROUTER_API_KEY is required when using OpenRouter provider"); + } + // Bedrock uses AWS credential chain, no explicit validation needed + }, +}; diff --git a/android-action-kernel/src/constants.ts b/android-action-kernel/src/constants.ts new file mode 100644 index 0000000..498468a --- /dev/null +++ b/android-action-kernel/src/constants.ts @@ -0,0 +1,78 @@ +/** + * Constants for Android Action Kernel. + * All magic strings, URLs, and fixed values in one place. 
+ */ + +// =========================================== +// API Endpoints +// =========================================== +export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1"; + +// =========================================== +// ADB Key Codes +// =========================================== +export const KEYCODE_ENTER = "66"; +export const KEYCODE_HOME = "KEYCODE_HOME"; +export const KEYCODE_BACK = "KEYCODE_BACK"; +export const KEYCODE_DEL = "67"; +export const KEYCODE_FORWARD_DEL = "112"; +export const KEYCODE_MOVE_HOME = "122"; +export const KEYCODE_MOVE_END = "123"; +export const KEYCODE_MENU = "82"; +export const KEYCODE_TAB = "61"; +export const KEYCODE_ESCAPE = "111"; +export const KEYCODE_DPAD_UP = "19"; +export const KEYCODE_DPAD_DOWN = "20"; +export const KEYCODE_DPAD_LEFT = "21"; +export const KEYCODE_DPAD_RIGHT = "22"; +export const KEYCODE_VOLUME_UP = "24"; +export const KEYCODE_VOLUME_DOWN = "25"; +export const KEYCODE_POWER = "26"; + +// =========================================== +// Default Screen Coordinates (for swipe actions) +// Adjust based on target device resolution +// =========================================== +export const SCREEN_CENTER_X = 540; +export const SCREEN_CENTER_Y = 1200; + +// Swipe coordinates: [start_x, start_y, end_x, end_y] +export const SWIPE_COORDS: Record = { + up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500], + down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500], + left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y], + right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y], +}; +export const SWIPE_DURATION_MS = "300"; +export const LONG_PRESS_DURATION_MS = "1000"; + +// =========================================== +// Default Models +// =========================================== +export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile"; +export const DEFAULT_OPENAI_MODEL = "gpt-4o"; +export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0"; +export const DEFAULT_OPENROUTER_MODEL = 
"anthropic/claude-3.5-sonnet"; + +// =========================================== +// Bedrock Model Identifiers +// =========================================== +export const BEDROCK_ANTHROPIC_MODELS = ["anthropic"]; +export const BEDROCK_META_MODELS = ["meta", "llama"]; + +// =========================================== +// File Paths +// =========================================== +export const DEVICE_DUMP_PATH = "/sdcard/window_dump.xml"; +export const LOCAL_DUMP_PATH = "window_dump.xml"; +export const DEVICE_SCREENSHOT_PATH = "/sdcard/kernel_screenshot.png"; +export const LOCAL_SCREENSHOT_PATH = "kernel_screenshot.png"; + +// =========================================== +// Agent Defaults +// =========================================== +export const DEFAULT_MAX_STEPS = 30; +export const DEFAULT_STEP_DELAY = 2.0; +export const DEFAULT_MAX_RETRIES = 3; +export const DEFAULT_STUCK_THRESHOLD = 3; +export const DEFAULT_VISION_ENABLED = true; diff --git a/android-action-kernel/src/kernel.ts b/android-action-kernel/src/kernel.ts new file mode 100644 index 0000000..fdb36bf --- /dev/null +++ b/android-action-kernel/src/kernel.ts @@ -0,0 +1,298 @@ +/** + * Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition) + * + * An AI agent that controls Android devices through the accessibility API. + * Uses LLMs to make decisions based on screen context. 
+ * + * Features: + * - Perception → Reasoning → Action loop + * - Screen state diffing (stuck loop detection) + * - Error recovery with retries + * - Vision fallback when accessibility tree is empty + * - Dynamic early exit on goal completion + * - 15 actions: tap, type, enter, swipe, home, back, wait, done, + * longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell + * + * Usage: + * bun run src/kernel.ts + */ + +import { existsSync, readFileSync } from "fs"; + +import { Config } from "./config.js"; +import { + executeAction, + runAdbCommand, + type ActionDecision, + type ActionResult, +} from "./actions.js"; +import { getLlmProvider, type LLMProvider } from "./llm-providers.js"; +import { + getInteractiveElements, + computeScreenHash, + type UIElement, +} from "./sanitizer.js"; +import { + DEVICE_SCREENSHOT_PATH, + LOCAL_SCREENSHOT_PATH, +} from "./constants.js"; + +// =========================================== +// Screen Perception +// =========================================== + +/** + * Dumps the current UI XML and returns parsed elements + JSON string. + */ +function getScreenState(): { elements: UIElement[]; json: string } { + try { + runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]); + runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]); + } catch { + console.log("Warning: ADB screen capture failed."); + return { elements: [], json: "Error: Could not capture screen." }; + } + + if (!existsSync(Config.LOCAL_DUMP_PATH)) { + return { elements: [], json: "Error: Could not capture screen." }; + } + + const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8"); + const elements = getInteractiveElements(xmlContent); + return { elements, json: JSON.stringify(elements, null, 2) }; +} + +/** + * Captures a screenshot and returns the local file path. + * Used as a vision fallback when the accessibility tree is empty. 
+ */ +function captureScreenshot(): string | null { + try { + runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]); + runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]); + if (existsSync(LOCAL_SCREENSHOT_PATH)) { + return LOCAL_SCREENSHOT_PATH; + } + } catch { + console.log("Warning: Screenshot capture failed."); + } + return null; +} + +// =========================================== +// Screen State Diffing +// =========================================== + +interface ScreenDiff { + changed: boolean; + addedTexts: string[]; + removedTexts: string[]; + summary: string; +} + +function diffScreenState( + prevElements: UIElement[], + currElements: UIElement[] +): ScreenDiff { + const prevTexts = new Set(prevElements.map((e) => e.text).filter(Boolean)); + const currTexts = new Set(currElements.map((e) => e.text).filter(Boolean)); + + const addedTexts = [...currTexts].filter((t) => !prevTexts.has(t)); + const removedTexts = [...prevTexts].filter((t) => !currTexts.has(t)); + + const prevHash = computeScreenHash(prevElements); + const currHash = computeScreenHash(currElements); + const changed = prevHash !== currHash; + + let summary = ""; + if (!changed) { + summary = "Screen has NOT changed since last action."; + } else { + const parts: string[] = []; + if (addedTexts.length > 0) { + parts.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`); + } + if (removedTexts.length > 0) { + parts.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`); + } + summary = parts.join(". 
") || "Screen layout changed."; + } + + return { changed, addedTexts, removedTexts, summary }; +} + +// =========================================== +// Action History Formatting +// =========================================== + +function formatActionHistory( + actionHistory: ActionDecision[], + resultHistory: ActionResult[] +): string { + if (actionHistory.length === 0) return ""; + + const lines = actionHistory.map((entry, i) => { + const actionType = entry.action ?? "unknown"; + const reason = entry.reason ?? "N/A"; + const result = resultHistory[i]; + const outcome = result ? (result.success ? "OK" : "FAILED") : ""; + + if (actionType === "type") { + return `Step ${i + 1}: typed "${entry.text ?? ""}" - ${reason} [${outcome}]`; + } + if (actionType === "tap") { + return `Step ${i + 1}: tapped ${JSON.stringify(entry.coordinates ?? [])} - ${reason} [${outcome}]`; + } + if (actionType === "launch") { + return `Step ${i + 1}: launched ${entry.package ?? entry.uri ?? ""} - ${reason} [${outcome}]`; + } + if (actionType === "screenshot") { + return `Step ${i + 1}: took screenshot - ${reason} [${outcome}]`; + } + return `Step ${i + 1}: ${actionType} - ${reason} [${outcome}]`; + }); + + return "\n\nPREVIOUS_ACTIONS:\n" + lines.join("\n"); +} + +// =========================================== +// Main Agent Loop +// =========================================== + +async function runAgent(goal: string, maxSteps?: number): Promise { + const steps = maxSteps ?? Config.MAX_STEPS; + + console.log("Android Action Kernel Started"); + console.log(`Goal: ${goal}`); + console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`); + console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`); + console.log(`Vision fallback: ${Config.VISION_ENABLED ? 
"ON" : "OFF"}`); + + const llm = getLlmProvider(); + const actionHistory: ActionDecision[] = []; + const resultHistory: ActionResult[] = []; + let prevElements: UIElement[] = []; + let stuckCount = 0; + + for (let step = 0; step < steps; step++) { + console.log(`\n--- Step ${step + 1}/${steps} ---`); + + // 1. Perception: Capture screen state + console.log("Scanning screen..."); + const { elements, json: screenContext } = getScreenState(); + + // 2. Screen diff: detect stuck loops + let diffContext = ""; + if (step > 0) { + const diff = diffScreenState(prevElements, elements); + diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`; + + if (!diff.changed) { + stuckCount++; + console.log( + `Warning: Screen unchanged for ${stuckCount} step(s).` + ); + if (stuckCount >= Config.STUCK_THRESHOLD) { + console.log( + `Stuck for ${stuckCount} steps. Injecting recovery hint.` + ); + diffContext += + `\nWARNING: You have been stuck for ${stuckCount} steps. ` + + `The screen is NOT changing. Try a DIFFERENT action: ` + + `swipe to scroll, press back, go home, or launch a different app.`; + } + } else { + stuckCount = 0; + } + } + prevElements = elements; + + // 3. Vision fallback: if accessibility tree is empty, use screenshot + let visionContext = ""; + if (elements.length === 0 && Config.VISION_ENABLED) { + console.log("Accessibility tree empty. Attempting vision fallback..."); + const screenshotPath = captureScreenshot(); + if (screenshotPath) { + visionContext = + "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " + + "A screenshot has been captured. The screen likely contains custom-drawn " + + "content (game, WebView, or Flutter). Try using coordinate-based taps on " + + "common UI positions, or use 'back'/'home' to navigate away. " + + "If you know the app package name, use 'launch' to restart it."; + console.log("Vision fallback: screenshot captured for context."); + } + } + + // 4. 
Reasoning: Get LLM decision + console.log("Thinking..."); + const historyStr = formatActionHistory(actionHistory, resultHistory); + const fullContext = screenContext + historyStr + diffContext + visionContext; + + let decision: ActionDecision; + try { + decision = await llm.getDecision(goal, fullContext, actionHistory); + } catch (err) { + console.log(`LLM Error: ${(err as Error).message}`); + console.log("Falling back to wait action."); + decision = { action: "wait", reason: "LLM request failed, waiting for retry" }; + } + + console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"}`); + + // 5. Action: Execute the decision + let result: ActionResult; + try { + result = executeAction(decision); + } catch (err) { + console.log(`Action Error: ${(err as Error).message}`); + result = { success: false, message: (err as Error).message }; + } + + // Track history + actionHistory.push(decision); + resultHistory.push(result); + + // 6. Check for goal completion + if (decision.action === "done") { + console.log("\nTask completed successfully."); + return; + } + + // Wait for UI to update + await Bun.sleep(Config.STEP_DELAY * 1000); + } + + console.log("\nMax steps reached. Task may be incomplete."); +} + +// =========================================== +// Entry Point +// =========================================== + +async function main(): Promise { + try { + Config.validate(); + } catch (e) { + console.log(`Configuration Error: ${(e as Error).message}`); + return; + } + + // Read user input from stdin + process.stdout.write("Enter your goal: "); + const goal = await new Promise((resolve) => { + const reader = Bun.stdin.stream().getReader(); + reader.read().then(({ value }) => { + resolve(new TextDecoder().decode(value).trim()); + reader.releaseLock(); + }); + }); + + if (!goal) { + console.log("No goal provided. 
Exiting."); + return; + } + + await runAgent(goal); +} + +main(); diff --git a/android-action-kernel/src/llm-providers.ts b/android-action-kernel/src/llm-providers.ts new file mode 100644 index 0000000..64a654b --- /dev/null +++ b/android-action-kernel/src/llm-providers.ts @@ -0,0 +1,327 @@ +/** + * LLM Provider module for Android Action Kernel. + * Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK). + */ + +import OpenAI from "openai"; +import { + BedrockRuntimeClient, + InvokeModelCommand, +} from "@aws-sdk/client-bedrock-runtime"; +import { generateText } from "ai"; +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; + +import { Config } from "./config.js"; +import { + GROQ_API_BASE_URL, + BEDROCK_ANTHROPIC_MODELS, + BEDROCK_META_MODELS, +} from "./constants.js"; +import type { ActionDecision } from "./actions.js"; + +// =========================================== +// System Prompt — all 15 actions + rich element context +// =========================================== + +const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI. + +You will receive: +1. GOAL — the user's task. +2. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates, states, and hierarchy. +3. PREVIOUS_ACTIONS — your action history with outcomes (OK/FAILED). +4. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck). +5. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView). + +You must output ONLY a valid JSON object with your next action. 
+ +═══════════════════════════════════════════ +AVAILABLE ACTIONS (15 total) +═══════════════════════════════════════════ + +Navigation: + {"action": "tap", "coordinates": [x, y], "reason": "..."} + {"action": "longpress", "coordinates": [x, y], "reason": "..."} + {"action": "swipe", "direction": "up|down|left|right", "reason": "..."} + {"action": "enter", "reason": "Press Enter/submit"} + {"action": "back", "reason": "Navigate back"} + {"action": "home", "reason": "Go to home screen"} + +Text Input: + {"action": "type", "text": "Hello World", "reason": "..."} + {"action": "clear", "reason": "Clear current text field before typing"} + +App Control: + {"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"} + {"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"} + {"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"} + +Data: + {"action": "screenshot", "reason": "Capture current screen"} + {"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"} + {"action": "clipboard_get", "reason": "Read clipboard contents"} + {"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"} + +System: + {"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"} + {"action": "wait", "reason": "Wait for screen to load"} + {"action": "done", "reason": "Task is complete"} + +═══════════════════════════════════════════ +ELEMENT PROPERTIES YOU WILL SEE +═══════════════════════════════════════════ + +Each element in SCREEN_CONTEXT has: +- text: visible label or content description +- center: [x, y] coordinates to tap +- size: [width, height] in pixels +- enabled: whether the element can be interacted with (DO NOT tap disabled elements!) 
+- checked: checkbox/toggle state (true = ON) +- focused: whether this field currently has input focus +- selected: whether this item is currently selected (tabs, list items) +- scrollable: whether this container can be scrolled +- longClickable: supports long-press for context menu +- editable: text input field +- password: password input (don't read/log the text) +- hint: placeholder text shown when field is empty +- parent: the containing element (helps understand layout hierarchy) +- action: suggested action — "tap", "type", "longpress", "scroll", or "read" + +═══════════════════════════════════════════ +CRITICAL RULES +═══════════════════════════════════════════ + +1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative. +2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type". +3. ALREADY TYPED: Check PREVIOUS_ACTIONS. Do NOT re-type text you already entered. +4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else. +5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy. +6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen. +7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc). +8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc). +9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it. +10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return. +11. PASSWORDS: Never log or output the text of password fields. +12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success. +13. SEARCH: After typing in a search field, use "enter" to submit the search. +14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents. +15. 
CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`; + +// =========================================== +// Provider Interface +// =========================================== + +interface ActionHistoryEntry { + action?: string; + reason?: string; + text?: string; + coordinates?: [number, number]; + package?: string; + uri?: string; +} + +export interface LLMProvider { + getDecision( + goal: string, + screenContext: string, + actionHistory: ActionHistoryEntry[] + ): Promise; +} + +// =========================================== +// OpenAI / Groq Provider +// =========================================== + +class OpenAIProvider implements LLMProvider { + private client: OpenAI; + private model: string; + + constructor() { + if (Config.LLM_PROVIDER === "groq") { + this.client = new OpenAI({ + apiKey: Config.GROQ_API_KEY, + baseURL: GROQ_API_BASE_URL, + }); + this.model = Config.GROQ_MODEL; + } else { + this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY }); + this.model = Config.OPENAI_MODEL; + } + } + + async getDecision( + goal: string, + screenContext: string, + _actionHistory: ActionHistoryEntry[] + ): Promise { + // screenContext now includes history, diff, and vision context from kernel + const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`; + + const response = await this.client.chat.completions.create({ + model: this.model, + response_format: { type: "json_object" }, + messages: [ + { role: "system", content: SYSTEM_PROMPT }, + { role: "user", content: userContent }, + ], + }); + + return JSON.parse(response.choices[0].message.content ?? 
"{}"); + } +} + +// =========================================== +// OpenRouter Provider (Vercel AI SDK) +// =========================================== + +class OpenRouterProvider implements LLMProvider { + private openrouter: ReturnType; + private model: string; + + constructor() { + this.openrouter = createOpenRouter({ + apiKey: Config.OPENROUTER_API_KEY, + }); + this.model = Config.OPENROUTER_MODEL; + } + + async getDecision( + goal: string, + screenContext: string, + _actionHistory: ActionHistoryEntry[] + ): Promise { + const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`; + + const result = await generateText({ + model: this.openrouter.chat(this.model), + system: SYSTEM_PROMPT, + prompt: userContent + "\n\nRespond with ONLY a valid JSON object.", + }); + + return parseJsonResponse(result.text); + } +} + +// =========================================== +// AWS Bedrock Provider +// =========================================== + +class BedrockProvider implements LLMProvider { + private client: BedrockRuntimeClient; + private model: string; + + constructor() { + this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION }); + this.model = Config.BEDROCK_MODEL; + } + + async getDecision( + goal: string, + screenContext: string, + _actionHistory: ActionHistoryEntry[] + ): Promise { + const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`; + const requestBody = this.buildRequest(userContent); + + const command = new InvokeModelCommand({ + modelId: this.model, + body: new TextEncoder().encode(requestBody), + contentType: "application/json", + accept: "application/json", + }); + + const response = await this.client.send(command); + const responseBody = JSON.parse(new TextDecoder().decode(response.body)); + const resultText = this.extractResponse(responseBody); + + return parseJsonResponse(resultText); + } + + private isAnthropicModel(): boolean { + return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id)); + } + + 
private isMetaModel(): boolean { + return BEDROCK_META_MODELS.some((id) => + this.model.toLowerCase().includes(id) + ); + } + + private buildRequest(userContent: string): string { + if (this.isAnthropicModel()) { + return JSON.stringify({ + anthropic_version: "bedrock-2023-05-31", + max_tokens: 1024, + system: SYSTEM_PROMPT, + messages: [ + { + role: "user", + content: + userContent + "\n\nRespond with ONLY a valid JSON object.", + }, + ], + }); + } + + if (this.isMetaModel()) { + return JSON.stringify({ + prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${userContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`, + max_gen_len: 512, + temperature: 0.1, + }); + } + + return JSON.stringify({ + inputText: `${SYSTEM_PROMPT}\n\n${userContent}\n\nRespond with ONLY a valid JSON object.`, + textGenerationConfig: { + maxTokenCount: 512, + temperature: 0.1, + }, + }); + } + + private extractResponse(responseBody: Record): string { + if (this.isAnthropicModel()) { + return responseBody.content[0].text; + } + if (this.isMetaModel()) { + return responseBody.generation ?? 
""; + } + return responseBody.results[0].outputText; + } +} + +// =========================================== +// Shared JSON Parsing +// =========================================== + +function parseJsonResponse(text: string): ActionDecision { + try { + return JSON.parse(text); + } catch { + // Try to extract JSON from markdown code blocks or mixed text + const match = text.match(/\{[\s\S]*?\}/); + if (match) { + try { + return JSON.parse(match[0]); + } catch { + // fall through + } + } + console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`); + return { action: "wait", reason: "Failed to parse response, waiting" }; + } +} + +// =========================================== +// Factory +// =========================================== + +export function getLlmProvider(): LLMProvider { + if (Config.LLM_PROVIDER === "bedrock") { + return new BedrockProvider(); + } + if (Config.LLM_PROVIDER === "openrouter") { + return new OpenRouterProvider(); + } + return new OpenAIProvider(); +} diff --git a/android-action-kernel/src/sanitizer.ts b/android-action-kernel/src/sanitizer.ts new file mode 100644 index 0000000..684b009 --- /dev/null +++ b/android-action-kernel/src/sanitizer.ts @@ -0,0 +1,171 @@ +/** + * XML Sanitizer for Android Action Kernel. + * Parses Android Accessibility XML and extracts interactive UI elements + * with full state information and parent-child hierarchy context. 
+ */ + +import { XMLParser } from "fast-xml-parser"; + +export interface UIElement { + id: string; + text: string; + type: string; + bounds: string; + center: [number, number]; + size: [number, number]; + clickable: boolean; + editable: boolean; + enabled: boolean; + checked: boolean; + focused: boolean; + selected: boolean; + scrollable: boolean; + longClickable: boolean; + password: boolean; + hint: string; + action: "tap" | "type" | "longpress" | "scroll" | "read"; + parent: string; + depth: number; +} + +/** + * Compute a hash of element texts/ids for screen state comparison. + */ +export function computeScreenHash(elements: UIElement[]): string { + const parts = elements.map( + (e) => `${e.id}|${e.text}|${e.center[0]},${e.center[1]}|${e.enabled}|${e.checked}` + ); + return parts.join(";"); +} + +/** + * Parses Android Accessibility XML and returns a rich list of interactive elements. + * Preserves state (enabled, checked, focused) and hierarchy context. + */ +export function getInteractiveElements(xmlContent: string): UIElement[] { + const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: "@_", + allowBooleanAttributes: true, + }); + + let parsed: unknown; + try { + parsed = parser.parse(xmlContent); + } catch { + console.log("Warning: Error parsing XML. 
The screen might be loading."); + return []; + } + + const elements: UIElement[] = []; + + function walk(node: any, parentLabel: string, depth: number): void { + if (!node || typeof node !== "object") return; + + if (node["@_bounds"]) { + const isClickable = node["@_clickable"] === "true"; + const isLongClickable = node["@_long-clickable"] === "true"; + const isScrollable = node["@_scrollable"] === "true"; + const isEnabled = node["@_enabled"] !== "false"; // default true + const isChecked = node["@_checked"] === "true"; + const isFocused = node["@_focused"] === "true"; + const isSelected = node["@_selected"] === "true"; + const isPassword = node["@_password"] === "true"; + + const elementClass = node["@_class"] ?? ""; + const isEditable = + elementClass.includes("EditText") || + elementClass.includes("AutoCompleteTextView") || + node["@_editable"] === "true"; + + const text: string = node["@_text"] ?? ""; + const desc: string = node["@_content-desc"] ?? ""; + const resourceId: string = node["@_resource-id"] ?? ""; + const hint: string = node["@_hint"] ?? ""; + + // Build a label for this node to use as parent context for children + const typeName = elementClass.split(".").pop() ?? 
""; + const nodeLabel = text || desc || resourceId.split("/").pop() || typeName; + + // Determine if this element should be included + const isInteractive = isClickable || isEditable || isLongClickable || isScrollable; + const hasContent = !!(text || desc); + + if (isInteractive || hasContent) { + const bounds: string = node["@_bounds"]; + try { + const coords = bounds + .replace("][", ",") + .replace("[", "") + .replace("]", "") + .split(",") + .map(Number); + + const [x1, y1, x2, y2] = coords; + const centerX = Math.floor((x1 + x2) / 2); + const centerY = Math.floor((y1 + y2) / 2); + const width = x2 - x1; + const height = y2 - y1; + + // Skip zero-size elements (invisible) + if (width <= 0 || height <= 0) { + // still walk children + } else { + let suggestedAction: UIElement["action"]; + if (isEditable) suggestedAction = "type"; + else if (isLongClickable && !isClickable) suggestedAction = "longpress"; + else if (isScrollable && !isClickable) suggestedAction = "scroll"; + else if (isClickable) suggestedAction = "tap"; + else suggestedAction = "read"; + + elements.push({ + id: resourceId, + text: text || desc, + type: typeName, + bounds, + center: [centerX, centerY], + size: [width, height], + clickable: isClickable, + editable: isEditable, + enabled: isEnabled, + checked: isChecked, + focused: isFocused, + selected: isSelected, + scrollable: isScrollable, + longClickable: isLongClickable, + password: isPassword, + hint: hint, + action: suggestedAction, + parent: parentLabel, + depth, + }); + } + } catch { + // Skip malformed bounds + } + } + + // Recurse with updated parent label + walkChildren(node, nodeLabel, depth + 1); + return; + } + + // No bounds on this node — just recurse + walkChildren(node, parentLabel, depth); + } + + function walkChildren(node: any, parentLabel: string, depth: number): void { + if (node.node) { + const children = Array.isArray(node.node) ? 
node.node : [node.node]; + for (const child of children) { + walk(child, parentLabel, depth); + } + } + if (node.hierarchy) { + walk(node.hierarchy, parentLabel, depth); + } + } + + walk(parsed, "root", 0); + return elements; +} diff --git a/android-action-kernel/tsconfig.json b/android-action-kernel/tsconfig.json new file mode 100644 index 0000000..ea8d4f1 --- /dev/null +++ b/android-action-kernel/tsconfig.json @@ -0,0 +1,19 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ES2022", + "moduleResolution": "bundler", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "outDir": "dist", + "rootDir": "src", + "types": ["bun-types"], + "resolveJsonModule": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true + }, + "include": ["src/**/*.ts"], + "exclude": ["node_modules", "dist"] +}