initial commit
This commit is contained in:
BIN
android-action-kernel/.DS_Store
vendored
Normal file
BIN
android-action-kernel/.DS_Store
vendored
Normal file
Binary file not shown.
63
android-action-kernel/.env.example
Normal file
63
android-action-kernel/.env.example
Normal file
@@ -0,0 +1,63 @@
|
||||
# Android Action Kernel Configuration (TypeScript/Bun)
|
||||
# Copy this file to .env and fill in your settings
|
||||
# cp .env.example .env
|
||||
|
||||
# ===========================================
|
||||
# Agent Configuration
|
||||
# ===========================================
|
||||
MAX_STEPS=30 # Maximum steps before stopping (30 for complex multi-app tasks)
|
||||
STEP_DELAY=2 # Seconds to wait between steps
|
||||
MAX_RETRIES=3 # Retries on ADB/network failures
|
||||
STUCK_THRESHOLD=3 # Steps before stuck-loop recovery kicks in
|
||||
|
||||
# ===========================================
|
||||
# Vision Fallback (when accessibility tree is empty)
|
||||
# ===========================================
|
||||
VISION_ENABLED=true # Auto-capture screenshot when UI elements not found
|
||||
|
||||
# ===========================================
|
||||
# LLM Provider: "groq", "openai", "bedrock", or "openrouter"
|
||||
# ===========================================
|
||||
LLM_PROVIDER=groq
|
||||
|
||||
# ===========================================
|
||||
# Groq Configuration (Free tier available)
|
||||
# Get your key at: https://console.groq.com
|
||||
# ===========================================
|
||||
GROQ_API_KEY=gsk_your_key_here
|
||||
GROQ_MODEL=llama-3.3-70b-versatile
|
||||
# Other models: llama-3.1-8b-instant (faster, higher rate limits)
|
||||
|
||||
# ===========================================
|
||||
# OpenAI Configuration
|
||||
# Get your key at: https://platform.openai.com
|
||||
# ===========================================
|
||||
OPENAI_API_KEY=sk-your_key_here
|
||||
OPENAI_MODEL=gpt-4o
|
||||
# Other models: gpt-4o-mini (faster, cheaper)
|
||||
|
||||
# ===========================================
|
||||
# AWS Bedrock Configuration
|
||||
# Uses AWS credential chain (run 'aws configure' first)
|
||||
# ===========================================
|
||||
AWS_REGION=us-east-1
|
||||
BEDROCK_MODEL=us.meta.llama3-3-70b-instruct-v1:0
|
||||
# Other models:
|
||||
# anthropic.claude-3-sonnet-20240229-v1:0
|
||||
# anthropic.claude-3-haiku-20240307-v1:0
|
||||
# meta.llama3-8b-instruct-v1:0
|
||||
|
||||
# ===========================================
|
||||
# OpenRouter Configuration (via Vercel AI SDK)
|
||||
# Access 200+ models through a single API
|
||||
# Get your key at: https://openrouter.ai/keys
|
||||
# ===========================================
|
||||
OPENROUTER_API_KEY=sk-or-v1-your_key_here
|
||||
OPENROUTER_MODEL=anthropic/claude-3.5-sonnet
|
||||
# Popular models:
|
||||
# anthropic/claude-3.5-sonnet (best reasoning)
|
||||
# openai/gpt-4o (multimodal)
|
||||
# google/gemini-2.0-flash-001 (fast + cheap)
|
||||
# meta-llama/llama-3.3-70b-instruct (open source)
|
||||
# mistralai/mistral-large-latest (European)
|
||||
# deepseek/deepseek-chat (cost efficient)
|
||||
4
android-action-kernel/.gitignore
vendored
Normal file
4
android-action-kernel/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
node_modules/
|
||||
dist/
|
||||
bun.lock
|
||||
.env
|
||||
22
android-action-kernel/package.json
Normal file
22
android-action-kernel/package.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "android-action-kernel",
|
||||
"version": "1.0.0",
|
||||
"description": "AI agent that controls Android devices through the accessibility API - TypeScript/Bun edition",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"start": "bun run src/kernel.ts",
|
||||
"build": "bun build src/kernel.ts --outdir dist --target bun",
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-bedrock-runtime": "^3.700.0",
|
||||
"@openrouter/ai-sdk-provider": "^2.1.1",
|
||||
"ai": "^6.0.72",
|
||||
"fast-xml-parser": "^4.5.0",
|
||||
"openai": "^4.73.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bun": "^1.1.0",
|
||||
"typescript": "^5.6.0"
|
||||
}
|
||||
}
|
||||
322
android-action-kernel/src/actions.ts
Normal file
322
android-action-kernel/src/actions.ts
Normal file
@@ -0,0 +1,322 @@
|
||||
/**
|
||||
* Action execution module for Android Action Kernel.
|
||||
* Handles all ADB commands for interacting with Android devices.
|
||||
*
|
||||
* Supported actions:
|
||||
* tap, type, enter, swipe, home, back, wait, done,
|
||||
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
|
||||
*/
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
KEYCODE_ENTER,
|
||||
KEYCODE_HOME,
|
||||
KEYCODE_BACK,
|
||||
KEYCODE_DEL,
|
||||
KEYCODE_MOVE_HOME,
|
||||
KEYCODE_MOVE_END,
|
||||
SWIPE_COORDS,
|
||||
SWIPE_DURATION_MS,
|
||||
LONG_PRESS_DURATION_MS,
|
||||
DEVICE_SCREENSHOT_PATH,
|
||||
LOCAL_SCREENSHOT_PATH,
|
||||
} from "./constants.js";
|
||||
|
||||
/**
 * A single step chosen by the LLM for the kernel to carry out.
 *
 * Only `action` is mandatory; the remaining fields are read solely by the
 * handler that matches the action name.
 */
export interface ActionDecision {
  /** Name of the action to dispatch on (e.g. "tap", "type", "launch"). */
  action: string;
  /** Screen position as an `[x, y]` pair; read by tap and longpress. */
  coordinates?: [number, number];
  /** Text payload; read by type and clipboard_set. */
  text?: string;
  /** Swipe direction key; read by swipe. */
  direction?: string;
  /** Free-form explanation from the LLM; logged and echoed in the history. */
  reason?: string;

  // --- launch action ---
  /** Application package name. */
  package?: string;
  /** Activity name, combined with `package` into a component. */
  activity?: string;
  /** Data URI opened with a VIEW intent. */
  uri?: string;
  /** String extras attached to the launch intent. */
  extras?: Record<string, string>;

  // --- shell action ---
  /** Command line executed through `adb shell`. */
  command?: string;

  // --- screenshot action ---
  /** Local file name the screenshot is pulled to. */
  filename?: string;
}
|
||||
|
||||
/** Outcome reported back to the kernel after an action handler runs. */
export interface ActionResult {
  /** `true` when the handler considered the action carried out. */
  success: boolean;
  /** Human-readable summary of what happened. */
  message: string;
  /** Optional payload such as command output or a saved file path. */
  data?: string;
}
|
||||
|
||||
/**
 * Runs `adb <command...>` synchronously and returns its trimmed stdout.
 *
 * An attempt counts as failed when stderr is non-empty and contains "error"
 * (case-insensitive). Failed attempts are retried with exponential backoff
 * (1s, 2s, 4s, ...). Once the retries are exhausted the failure is logged and
 * the stdout of the last attempt is returned; ADB errors are never thrown.
 *
 * @param command Arguments passed to the ADB binary (without the binary itself).
 * @param retries Number of extra attempts after the first one. Non-finite or
 *   negative values (e.g. NaN from a malformed MAX_RETRIES setting) are treated
 *   as 0 so that the command is always executed at least once.
 * @returns Trimmed stdout of the last attempt.
 */
export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): string {
  // Without this guard a NaN retry count makes the loop condition false on the
  // very first check, silently skipping the command altogether.
  const maxRetries = Number.isFinite(retries) && retries > 0 ? Math.floor(retries) : 0;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const result = Bun.spawnSync([Config.ADB_PATH, ...command], {
      stdout: "pipe",
      stderr: "pipe",
    });

    const stdout = result.stdout.toString().trim();
    const stderr = result.stderr.toString().trim();

    const failed = stderr !== "" && stderr.toLowerCase().includes("error");
    if (!failed) {
      return stdout;
    }

    if (attempt >= maxRetries) {
      console.log(`ADB Error (all retries exhausted): ${stderr}`);
      return stdout;
    }

    // Exponential backoff: 1s, 2s, 4s, ...
    const delay = Math.pow(2, attempt) * 1000;
    console.log(`ADB Error (attempt ${attempt + 1}/${maxRetries + 1}): ${stderr}`);
    console.log(`Retrying in ${delay / 1000}s...`);
    Bun.sleepSync(delay);
  }

  // Not reachable: the final loop iteration always returns.
  return "";
}
|
||||
|
||||
/**
 * Dispatches an LLM decision to the handler registered for its `action` name
 * and returns that handler's result for the kernel to track.
 *
 * Unrecognised action names are logged as a warning and reported as a failed
 * result instead of throwing.
 */
export function executeAction(action: ActionDecision): ActionResult {
  // A Map (rather than a plain object) keeps inherited keys such as
  // "constructor" from ever being treated as a known action.
  const handlers = new Map<string, (decision: ActionDecision) => ActionResult>([
    ["tap", (decision) => executeTap(decision)],
    ["type", (decision) => executeType(decision)],
    ["enter", () => executeEnter()],
    ["swipe", (decision) => executeSwipe(decision)],
    ["home", () => executeHome()],
    ["back", () => executeBack()],
    ["wait", () => executeWait()],
    ["done", (decision) => executeDone(decision)],
    ["longpress", (decision) => executeLongPress(decision)],
    ["screenshot", (decision) => executeScreenshot(decision)],
    ["launch", (decision) => executeLaunch(decision)],
    ["clear", () => executeClear()],
    ["clipboard_get", () => executeClipboardGet()],
    ["clipboard_set", (decision) => executeClipboardSet(decision)],
    ["shell", (decision) => executeShell(decision)],
  ]);

  const handler = handlers.get(action.action);
  if (handler === undefined) {
    console.log(`Warning: Unknown action: ${action.action}`);
    return { success: false, message: `Unknown action: ${action.action}` };
  }
  return handler(action);
}
|
||||
|
||||
// ===========================================
|
||||
// Original actions (enhanced)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Taps the screen at `action.coordinates`.
 *
 * Returns a failed result, without touching the device, when the coordinates
 * are missing or not a pair of finite numbers. Previously such input silently
 * tapped (0, 0) and was reported as a success.
 */
function executeTap(action: ActionDecision): ActionResult {
  // The decision comes from LLM output, so do not trust its runtime shape.
  const raw: unknown = action.coordinates;
  const x = Array.isArray(raw) && raw.length >= 2 ? Number(raw[0]) : Number.NaN;
  const y = Array.isArray(raw) && raw.length >= 2 ? Number(raw[1]) : Number.NaN;
  if (!Number.isFinite(x) || !Number.isFinite(y)) {
    return { success: false, message: "No valid coordinates to tap" };
  }

  console.log(`Tapping: (${x}, ${y})`);
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);
  return { success: true, message: `Tapped (${x}, ${y})` };
}
|
||||
|
||||
/**
 * Escapes text for `adb shell input text`.
 *
 * The argument is re-parsed by the device-side shell, so every shell
 * metacharacter is backslash-escaped. Besides the characters handled before
 * (backslash, quotes, `&`, `|`, `;`, parentheses, `<`, `>`) this also covers
 * `$` and backticks (command substitution), glob characters, `#`, `~`, `!`,
 * braces and brackets. Spaces become `%s`, which `input text` turns back into
 * a space.
 */
function escapeInputText(text: string): string {
  return text
    .replace(/[\\"'`$&|;()<>*?#~!{}\[\]]/g, (ch) => `\\${ch}`)
    .replace(/ /g, "%s");
}

/**
 * Types `action.text` into the currently focused field.
 *
 * Returns a failed result when there is no text to type.
 */
function executeType(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to type" };

  console.log(`Typing: ${text}`);
  runAdbCommand(["shell", "input", "text", escapeInputText(text)]);
  return { success: true, message: `Typed "${text}"` };
}
|
||||
|
||||
/** Sends the Enter key event to the device. */
function executeEnter(): ActionResult {
  const keyEventArgs = ["shell", "input", "keyevent", KEYCODE_ENTER];
  console.log("Pressing Enter");
  runAdbCommand(keyEventArgs);
  const outcome: ActionResult = { success: true, message: "Pressed Enter" };
  return outcome;
}
|
||||
|
||||
/**
 * Swipes in the requested direction using the coordinates from SWIPE_COORDS.
 *
 * The direction is matched case-insensitively after trimming. A missing or
 * unknown direction falls back to "up"; the log and the result message always
 * name the direction that was actually swiped.
 */
function executeSwipe(action: ActionDecision): ActionResult {
  const requested =
    typeof action.direction === "string" ? action.direction.trim().toLowerCase() : "up";

  // Own-property check so inherited keys such as "constructor" are not
  // mistaken for a configured direction.
  const isKnown = Object.prototype.hasOwnProperty.call(SWIPE_COORDS, requested);
  if (!isKnown) {
    console.log(`Warning: Unknown swipe direction "${String(action.direction)}", using "up"`);
  }
  const direction = isKnown ? requested : "up";
  const coords = SWIPE_COORDS[direction];

  console.log(`Swiping ${direction}`);
  runAdbCommand([
    "shell", "input", "swipe",
    String(coords[0]), String(coords[1]),
    String(coords[2]), String(coords[3]),
    SWIPE_DURATION_MS,
  ]);
  return { success: true, message: `Swiped ${direction}` };
}
|
||||
|
||||
/** Sends the Home key event to the device. */
function executeHome(): ActionResult {
  const keyEventArgs = ["shell", "input", "keyevent", KEYCODE_HOME];
  console.log("Going Home");
  runAdbCommand(keyEventArgs);
  const outcome: ActionResult = { success: true, message: "Went to home screen" };
  return outcome;
}
|
||||
|
||||
/** Sends the Back key event to the device. */
function executeBack(): ActionResult {
  const keyEventArgs = ["shell", "input", "keyevent", KEYCODE_BACK];
  console.log("Going Back");
  runAdbCommand(keyEventArgs);
  const outcome: ActionResult = { success: true, message: "Went back" };
  return outcome;
}
|
||||
|
||||
/** Blocks for two seconds without touching the device. */
function executeWait(): ActionResult {
  const pauseMs = 2000;
  console.log("Waiting...");
  Bun.sleepSync(pauseMs);
  const outcome: ActionResult = { success: true, message: "Waited 2s" };
  return outcome;
}
|
||||
|
||||
/**
 * Logs that the goal was reached, using the LLM's reason when one is given,
 * and returns the fixed "done" result.
 */
function executeDone(action: ActionDecision): ActionResult {
  const summary = action.reason ?? "Task complete";
  console.log(`Goal Achieved: ${summary}`);
  const outcome: ActionResult = { success: true, message: "done" };
  return outcome;
}
|
||||
|
||||
// ===========================================
|
||||
// New actions
|
||||
// ===========================================
|
||||
|
||||
/**
 * Long-presses the screen at `action.coordinates`.
 *
 * Implemented as a swipe whose start and end points are identical and whose
 * duration is LONG_PRESS_DURATION_MS.
 *
 * Returns a failed result, without touching the device, when the coordinates
 * are missing or not a pair of finite numbers. Previously such input silently
 * long-pressed (0, 0) and was reported as a success.
 */
function executeLongPress(action: ActionDecision): ActionResult {
  // The decision comes from LLM output, so do not trust its runtime shape.
  const raw: unknown = action.coordinates;
  const x = Array.isArray(raw) && raw.length >= 2 ? Number(raw[0]) : Number.NaN;
  const y = Array.isArray(raw) && raw.length >= 2 ? Number(raw[1]) : Number.NaN;
  if (!Number.isFinite(x) || !Number.isFinite(y)) {
    return { success: false, message: "No valid coordinates to long press" };
  }

  console.log(`Long pressing: (${x}, ${y})`);
  runAdbCommand([
    "shell", "input", "swipe",
    String(x), String(y), String(x), String(y),
    LONG_PRESS_DURATION_MS,
  ]);
  return { success: true, message: `Long pressed (${x}, ${y})` };
}
|
||||
|
||||
/**
 * Takes a screenshot on the device and pulls it to a local file.
 *
 * The local destination is `action.filename` when provided, otherwise
 * LOCAL_SCREENSHOT_PATH; the chosen path is returned in `data`.
 */
function executeScreenshot(action: ActionDecision): ActionResult {
  const localPath = action.filename ?? LOCAL_SCREENSHOT_PATH;
  const captureArgs = ["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH];
  const pullArgs = ["pull", DEVICE_SCREENSHOT_PATH, localPath];

  console.log(`Taking screenshot → ${localPath}`);
  runAdbCommand(captureArgs);
  runAdbCommand(pullArgs);

  return {
    success: true,
    message: `Screenshot saved to ${localPath}`,
    data: localPath,
  };
}
|
||||
|
||||
/**
 * Wraps a value in single quotes for the device-side shell.
 *
 * Arguments given to `adb shell` are re-parsed by the shell on the device, so
 * an unquoted URI containing `&` or `?`, or an extra containing spaces, would
 * be split or interpreted. Embedded single quotes are emitted as `'\''`.
 */
function quoteLaunchArg(value: string): string {
  return "'" + String(value).replace(/'/g, "'\\''") + "'";
}

/**
 * Launches an app by package name, activity, or URI intent.
 *
 * Examples the LLM can produce:
 *   { action: "launch", package: "com.whatsapp" }
 *   { action: "launch", package: "com.whatsapp", activity: ".HomeActivity" }
 *   { action: "launch", uri: "https://maps.google.com/?q=pizza+near+me" }
 *   { action: "launch", package: "com.whatsapp", uri: "content://media/external/images/1",
 *     extras: { "android.intent.extra.TEXT": "Check this out" } }
 *
 * Behavior:
 *   - package only: the default launcher activity is started via `monkey`
 *     (extras cannot be passed on this path).
 *   - package + activity: `am start -n package/activity`.
 *   - uri: `am start -a android.intent.action.VIEW -d uri`; when a package is
 *     given without an activity the intent is limited to it with `-p`.
 *     Previously this combination fell into the `monkey` branch and dropped
 *     the uri and extras.
 *   - neither package nor uri: a failed result is returned instead of running
 *     an empty `am start`.
 */
function executeLaunch(action: ActionDecision): ActionResult {
  const { package: pkg, activity, uri, extras } = action;

  if (!pkg && !uri) {
    return { success: false, message: "No package or uri to launch" };
  }

  // Package alone: launch the default activity for the package.
  if (pkg && !activity && !uri) {
    console.log(`Launching: ${pkg}`);
    const launchResult = runAdbCommand([
      "shell", "monkey", "-p", quoteLaunchArg(pkg), "-c",
      "android.intent.category.LAUNCHER", "1",
    ]);
    return { success: true, message: `Launched ${pkg}`, data: launchResult };
  }

  const args: string[] = ["shell", "am", "start"];

  if (uri) {
    args.push("-a", "android.intent.action.VIEW");
    args.push("-d", quoteLaunchArg(uri));
  }

  if (pkg && activity) {
    args.push("-n", quoteLaunchArg(`${pkg}/${activity}`));
  } else if (pkg) {
    // uri + package without an explicit activity: keep the uri and extras and
    // restrict intent resolution to the requested package.
    args.push("-p", quoteLaunchArg(pkg));
  }

  // Attach intent extras
  if (extras) {
    for (const [key, value] of Object.entries(extras)) {
      args.push("--es", quoteLaunchArg(key), quoteLaunchArg(value));
    }
  }

  const label = pkg ?? uri ?? "intent";
  console.log(`Launching: ${label}`);
  const result = runAdbCommand(args);
  return { success: true, message: `Launched ${label}`, data: result };
}
|
||||
|
||||
/**
 * Attempts to empty the focused text field by sending three key events in
 * order: MOVE_END, a long-pressed MOVE_HOME, then DEL.
 *
 * NOTE(review): the previous comments described the middle step as a
 * "Shift+Home" selection, but the command sent is `keyevent --longpress` with
 * MOVE_HOME and no shift modifier. Confirm on a device that this really
 * selects the text before DEL is sent.
 */
function executeClear(): ActionResult {
  const keySequence: string[][] = [
    // Put the cursor at the end of the field
    [KEYCODE_MOVE_END],
    // Long-press MOVE_HOME (intended to extend the selection to the start)
    ["--longpress", KEYCODE_MOVE_HOME],
    // Remove whatever is selected
    [KEYCODE_DEL],
  ];

  console.log("Clearing text field");
  for (const keyArgs of keySequence) {
    runAdbCommand(["shell", "input", "keyevent", ...keyArgs]);
  }
  return { success: true, message: "Cleared text field" };
}
|
||||
|
||||
/**
 * Reads the current clipboard contents.
 *
 * First asks `cmd clipboard get-text`. When that prints nothing, falls back
 * to `service call clipboard 2 i32 1` and returns its raw output. Both paths
 * report success.
 */
function executeClipboardGet(): ActionResult {
  console.log("Reading clipboard");

  const clipboardText = runAdbCommand(["shell", "cmd", "clipboard", "get-text"]);
  if (!clipboardText) {
    // Nothing came back from the primary command: use the raw service call.
    const rawOutput = runAdbCommand([
      "shell", "service", "call", "clipboard", "2", "i32", "1",
    ]);
    return { success: true, message: `Clipboard (raw): ${rawOutput}`, data: rawOutput };
  }

  // Only the first 100 characters are logged; the full text is returned.
  console.log(`Clipboard: ${clipboardText.slice(0, 100)}`);
  return { success: true, message: `Clipboard: ${clipboardText}`, data: clipboardText };
}
|
||||
|
||||
/**
 * Wraps clipboard text in single quotes for the device-side shell.
 *
 * Arguments given to `adb shell` are re-parsed by the shell on the device, so
 * unquoted text would be split on spaces and have its metacharacters
 * interpreted. Embedded single quotes are emitted as `'\''`.
 */
function quoteClipboardText(value: string): string {
  return "'" + String(value).replace(/'/g, "'\\''") + "'";
}

/**
 * Sets the clipboard to `action.text`.
 *
 * Returns a failed result when there is no text. The text is passed to
 * `cmd clipboard set-text` as one quoted argument so that spaces and shell
 * metacharacters reach the clipboard literally.
 */
function executeClipboardSet(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to set on clipboard" };

  console.log(`Setting clipboard: ${text.slice(0, 50)}...`);
  runAdbCommand(["shell", "cmd", "clipboard", "set-text", quoteClipboardText(text)]);
  return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` };
}
|
||||
|
||||
/**
 * Runs an arbitrary ADB shell command. Use sparingly for edge cases.
 *
 * SECURITY NOTE: the command string comes straight from the LLM decision and
 * is executed on the device as-is. It is deliberately not escaped; restrict
 * or review this action if the model output cannot be trusted.
 *
 * Returns a failed result when the command is missing or contains only
 * whitespace. Previously a whitespace-only command passed the empty check and
 * was sent to `adb shell` as a list of empty arguments.
 *
 * `message` carries the first 200 characters of the output; `data` carries
 * all of it.
 */
function executeShell(action: ActionDecision): ActionResult {
  const cmd = (action.command ?? "").trim();
  if (!cmd) return { success: false, message: "No command provided" };

  console.log(`Shell: ${cmd}`);
  // adb joins its shell arguments with spaces before handing them to the
  // device, so one argument carries the same command line as the split form.
  const result = runAdbCommand(["shell", cmd]);
  return { success: true, message: `Shell output: ${result.slice(0, 200)}`, data: result };
}
|
||||
82
android-action-kernel/src/config.ts
Normal file
82
android-action-kernel/src/config.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
/**
|
||||
* Configuration management for Android Action Kernel.
|
||||
* Bun natively loads .env files — no dotenv needed.
|
||||
*/
|
||||
|
||||
import {
|
||||
DEVICE_DUMP_PATH,
|
||||
LOCAL_DUMP_PATH,
|
||||
DEVICE_SCREENSHOT_PATH,
|
||||
LOCAL_SCREENSHOT_PATH,
|
||||
DEFAULT_MAX_STEPS,
|
||||
DEFAULT_STEP_DELAY,
|
||||
DEFAULT_GROQ_MODEL,
|
||||
DEFAULT_OPENAI_MODEL,
|
||||
DEFAULT_BEDROCK_MODEL,
|
||||
DEFAULT_MAX_RETRIES,
|
||||
DEFAULT_STUCK_THRESHOLD,
|
||||
DEFAULT_VISION_ENABLED,
|
||||
} from "./constants.js";
|
||||
|
||||
/**
 * Looks up an environment variable.
 *
 * @param key Name of the variable.
 * @param fallback Value returned when the variable is not set (default "").
 * @returns The variable's value; an empty-string value is returned as-is.
 */
function env(key: string, fallback = ""): string {
  const value = process.env[key];
  if (value == null) {
    return fallback;
  }
  return value;
}
|
||||
|
||||
/**
 * Reads a non-negative integer setting. A missing, malformed or negative
 * value yields `fallback` instead of NaN, which would otherwise make the
 * agent loop and the ADB retry loop run zero times.
 */
function envInt(key: string, fallback: number): number {
  const parsed = parseInt(env(key, String(fallback)), 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

/** Reads a non-negative decimal setting with the same fallback rule as `envInt`. */
function envFloat(key: string, fallback: number): number {
  const parsed = parseFloat(env(key, String(fallback)));
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

/** Reads a boolean setting; only "true" (any case, surrounding spaces ignored) enables it. */
function envBool(key: string, fallback: boolean): boolean {
  return env(key, String(fallback)).trim().toLowerCase() === "true";
}

/**
 * Runtime configuration, resolved once from environment variables with the
 * defaults from constants.ts.
 */
export const Config = {
  // ADB Configuration
  ADB_PATH: env("ADB_PATH", "adb"),
  SCREEN_DUMP_PATH: DEVICE_DUMP_PATH,
  LOCAL_DUMP_PATH: LOCAL_DUMP_PATH,
  DEVICE_SCREENSHOT_PATH: DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH: LOCAL_SCREENSHOT_PATH,

  // Agent Configuration
  MAX_STEPS: envInt("MAX_STEPS", DEFAULT_MAX_STEPS),
  STEP_DELAY: envFloat("STEP_DELAY", DEFAULT_STEP_DELAY),
  MAX_RETRIES: envInt("MAX_RETRIES", DEFAULT_MAX_RETRIES),
  STUCK_THRESHOLD: envInt("STUCK_THRESHOLD", DEFAULT_STUCK_THRESHOLD),

  // Vision fallback (when accessibility tree is empty)
  VISION_ENABLED: envBool("VISION_ENABLED", DEFAULT_VISION_ENABLED),

  // LLM Provider: "groq", "openai", "bedrock", or "openrouter"
  LLM_PROVIDER: env("LLM_PROVIDER", "groq"),

  // Groq Configuration
  GROQ_API_KEY: env("GROQ_API_KEY"),
  GROQ_MODEL: env("GROQ_MODEL", DEFAULT_GROQ_MODEL),

  // OpenAI Configuration
  OPENAI_API_KEY: env("OPENAI_API_KEY"),
  OPENAI_MODEL: env("OPENAI_MODEL", DEFAULT_OPENAI_MODEL),

  // AWS Bedrock Configuration
  AWS_REGION: env("AWS_REGION", "us-east-1"),
  BEDROCK_MODEL: env("BEDROCK_MODEL", DEFAULT_BEDROCK_MODEL),

  // OpenRouter Configuration (via Vercel AI SDK)
  OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
  OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),

  /**
   * Returns the model name configured for the active provider. Any provider
   * other than "groq", "bedrock" or "openrouter" yields the OpenAI model.
   */
  getModel(): string {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq") return Config.GROQ_MODEL;
    if (provider === "bedrock") return Config.BEDROCK_MODEL;
    if (provider === "openrouter") return Config.OPENROUTER_MODEL;
    return Config.OPENAI_MODEL;
  },

  /**
   * Checks that the API key required by the active provider is present.
   *
   * @throws Error when the key for "groq", "openai" or "openrouter" is empty.
   */
  validate(): void {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq" && !Config.GROQ_API_KEY) {
      throw new Error("GROQ_API_KEY is required when using Groq provider");
    }
    if (provider === "openai" && !Config.OPENAI_API_KEY) {
      throw new Error("OPENAI_API_KEY is required when using OpenAI provider");
    }
    if (provider === "openrouter" && !Config.OPENROUTER_API_KEY) {
      throw new Error("OPENROUTER_API_KEY is required when using OpenRouter provider");
    }
    // Bedrock uses AWS credential chain, no explicit validation needed
  },
};
|
||||
78
android-action-kernel/src/constants.ts
Normal file
78
android-action-kernel/src/constants.ts
Normal file
@@ -0,0 +1,78 @@
|
||||
/**
|
||||
* Constants for Android Action Kernel.
|
||||
* All magic strings, URLs, and fixed values in one place.
|
||||
*/
|
||||
|
||||
// ---------------------------------------------------------------------------
// API endpoints
// ---------------------------------------------------------------------------

/** Base URL of Groq's OpenAI-compatible API. */
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";

// ---------------------------------------------------------------------------
// ADB key codes, kept as strings because they are passed as command arguments
// ---------------------------------------------------------------------------

export const KEYCODE_ENTER = "66";
/** Given by name rather than by number. */
export const KEYCODE_HOME = "KEYCODE_HOME";
/** Given by name rather than by number. */
export const KEYCODE_BACK = "KEYCODE_BACK";
export const KEYCODE_DEL = "67";
export const KEYCODE_FORWARD_DEL = "112";
export const KEYCODE_MOVE_HOME = "122";
export const KEYCODE_MOVE_END = "123";
export const KEYCODE_MENU = "82";
export const KEYCODE_TAB = "61";
export const KEYCODE_ESCAPE = "111";
export const KEYCODE_DPAD_UP = "19";
export const KEYCODE_DPAD_DOWN = "20";
export const KEYCODE_DPAD_LEFT = "21";
export const KEYCODE_DPAD_RIGHT = "22";
export const KEYCODE_VOLUME_UP = "24";
export const KEYCODE_VOLUME_DOWN = "25";
export const KEYCODE_POWER = "26";

// ---------------------------------------------------------------------------
// Default screen coordinates used by swipe actions.
// Adjust to the resolution of the target device.
// ---------------------------------------------------------------------------

export const SCREEN_CENTER_X = 540;
export const SCREEN_CENTER_Y = 1200;

/** Swipe gestures keyed by direction, each as [start_x, start_y, end_x, end_y]. */
export const SWIPE_COORDS: Record<string, [number, number, number, number]> = {
  // Vertical swipes run along the horizontal centre line
  up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500],
  down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500],
  // Horizontal swipes run along the vertical centre line
  left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y],
  right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y],
};

/** Swipe duration in milliseconds, as a command argument. */
export const SWIPE_DURATION_MS = "300";
/** Long-press duration in milliseconds, as a command argument. */
export const LONG_PRESS_DURATION_MS = "1000";

// ---------------------------------------------------------------------------
// Default model per LLM provider
// ---------------------------------------------------------------------------

export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile";
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";

// ---------------------------------------------------------------------------
// Bedrock model identifiers
// ---------------------------------------------------------------------------

export const BEDROCK_ANTHROPIC_MODELS = ["anthropic"];
export const BEDROCK_META_MODELS = ["meta", "llama"];

// ---------------------------------------------------------------------------
// File paths (device side and local side)
// ---------------------------------------------------------------------------

export const DEVICE_DUMP_PATH = "/sdcard/window_dump.xml";
export const LOCAL_DUMP_PATH = "window_dump.xml";
export const DEVICE_SCREENSHOT_PATH = "/sdcard/kernel_screenshot.png";
export const LOCAL_SCREENSHOT_PATH = "kernel_screenshot.png";

// ---------------------------------------------------------------------------
// Agent defaults (used when the matching environment variable is not set)
// ---------------------------------------------------------------------------

export const DEFAULT_MAX_STEPS = 30;
export const DEFAULT_STEP_DELAY = 2.0;
export const DEFAULT_MAX_RETRIES = 3;
export const DEFAULT_STUCK_THRESHOLD = 3;
export const DEFAULT_VISION_ENABLED = true;
|
||||
298
android-action-kernel/src/kernel.ts
Normal file
298
android-action-kernel/src/kernel.ts
Normal file
@@ -0,0 +1,298 @@
|
||||
/**
|
||||
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
|
||||
*
|
||||
* An AI agent that controls Android devices through the accessibility API.
|
||||
* Uses LLMs to make decisions based on screen context.
|
||||
*
|
||||
* Features:
|
||||
* - Perception → Reasoning → Action loop
|
||||
* - Screen state diffing (stuck loop detection)
|
||||
* - Error recovery with retries
|
||||
* - Vision fallback when accessibility tree is empty
|
||||
* - Dynamic early exit on goal completion
|
||||
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
|
||||
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
|
||||
*
|
||||
* Usage:
|
||||
* bun run src/kernel.ts
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync, rmSync } from "fs";
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
executeAction,
|
||||
runAdbCommand,
|
||||
type ActionDecision,
|
||||
type ActionResult,
|
||||
} from "./actions.js";
|
||||
import { getLlmProvider, type LLMProvider } from "./llm-providers.js";
|
||||
import {
|
||||
getInteractiveElements,
|
||||
computeScreenHash,
|
||||
type UIElement,
|
||||
} from "./sanitizer.js";
|
||||
import {
|
||||
DEVICE_SCREENSHOT_PATH,
|
||||
LOCAL_SCREENSHOT_PATH,
|
||||
} from "./constants.js";
|
||||
|
||||
// ===========================================
|
||||
// Screen Perception
|
||||
// ===========================================
|
||||
|
||||
/**
 * Dumps the current UI XML and returns parsed elements + JSON string.
 *
 * Any dump left over from an earlier step is deleted first, both locally and
 * on the device. `runAdbCommand` does not throw when a command fails, so
 * without this cleanup a failed dump or pull would silently re-read the
 * previous screen.
 *
 * On any failure (ADB, missing file, unreadable or unparsable XML) the result
 * is an empty element list with an error string in `json`.
 */
function getScreenState(): { elements: UIElement[]; json: string } {
  const captureFailed = (): { elements: UIElement[]; json: string } => ({
    elements: [],
    json: "Error: Could not capture screen.",
  });

  try {
    // Remove stale dumps; no retries because a missing file is not an error.
    rmSync(Config.LOCAL_DUMP_PATH, { force: true });
    runAdbCommand(["shell", "rm", "-f", Config.SCREEN_DUMP_PATH], 0);

    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
  } catch {
    console.log("Warning: ADB screen capture failed.");
    return captureFailed();
  }

  if (!existsSync(Config.LOCAL_DUMP_PATH)) {
    return captureFailed();
  }

  try {
    const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
    const elements = getInteractiveElements(xmlContent);
    return { elements, json: JSON.stringify(elements, null, 2) };
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    console.log(`Warning: Could not read or parse UI dump: ${detail}`);
    return captureFailed();
  }
}
|
||||
|
||||
/**
 * Captures a screenshot and returns the local file path.
 * Used as a vision fallback when the accessibility tree is empty.
 *
 * Screenshots left over from an earlier capture are deleted first, both
 * locally and on the device. `runAdbCommand` does not throw when a command
 * fails, so without this cleanup an old file would make a failed capture look
 * successful.
 *
 * @returns LOCAL_SCREENSHOT_PATH when a fresh file was pulled, otherwise null.
 */
function captureScreenshot(): string | null {
  try {
    // Remove stale screenshots; no retries because a missing file is not an error.
    rmSync(LOCAL_SCREENSHOT_PATH, { force: true });
    runAdbCommand(["shell", "rm", "-f", DEVICE_SCREENSHOT_PATH], 0);

    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);
    if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      return LOCAL_SCREENSHOT_PATH;
    }
  } catch {
    console.log("Warning: Screenshot capture failed.");
  }
  return null;
}
|
||||
|
||||
// ===========================================
|
||||
// Screen State Diffing
|
||||
// ===========================================
|
||||
|
||||
/** Result of comparing two consecutive screen captures. */
interface ScreenDiff {
  /** `true` when the screen hashes of the two captures differ. */
  changed: boolean;
  /** Element texts present now but not in the previous capture. */
  addedTexts: string[];
  /** Element texts present in the previous capture but not now. */
  removedTexts: string[];
  /** One-line description of the change for the LLM prompt. */
  summary: string;
}
|
||||
|
||||
/**
 * Compares the previous and current screen elements.
 *
 * `changed` is decided by comparing the two screen hashes. The added and
 * removed texts are the set differences of the non-empty element texts. The
 * summary names at most five added and five removed texts.
 */
function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const textsBefore = new Set(prevElements.map((el) => el.text).filter(Boolean));
  const textsAfter = new Set(currElements.map((el) => el.text).filter(Boolean));

  const addedTexts = [...textsAfter].filter((text) => !textsBefore.has(text));
  const removedTexts = [...textsBefore].filter((text) => !textsAfter.has(text));

  const hashBefore = computeScreenHash(prevElements);
  const hashAfter = computeScreenHash(currElements);
  const changed = hashBefore !== hashAfter;

  if (!changed) {
    return {
      changed,
      addedTexts,
      removedTexts,
      summary: "Screen has NOT changed since last action.",
    };
  }

  const notes: string[] = [];
  if (addedTexts.length > 0) {
    notes.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
  }
  if (removedTexts.length > 0) {
    notes.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
  }

  // The hash changed but no text did: report a layout-only change.
  const summary = notes.join(". ") || "Screen layout changed.";
  return { changed, addedTexts, removedTexts, summary };
}
|
||||
|
||||
// ===========================================
|
||||
// Action History Formatting
|
||||
// ===========================================
|
||||
|
||||
/**
 * Renders the actions taken so far as a PREVIOUS_ACTIONS section for the LLM.
 *
 * Each step becomes one line: what was done, the stated reason ("N/A" when
 * absent) and the outcome in brackets ("OK", "FAILED", or empty when no
 * result exists at that index). Returns "" when there is no history.
 */
function formatActionHistory(
  actionHistory: ActionDecision[],
  resultHistory: ActionResult[]
): string {
  if (actionHistory.length === 0) return "";

  const lines: string[] = [];
  actionHistory.forEach((entry, index) => {
    const actionType = entry.action ?? "unknown";
    const result = resultHistory[index];

    let outcome = "";
    if (result) {
      outcome = result.success ? "OK" : "FAILED";
    }

    let description: string;
    switch (actionType) {
      case "type":
        description = `typed "${entry.text ?? ""}"`;
        break;
      case "tap":
        description = `tapped ${JSON.stringify(entry.coordinates ?? [])}`;
        break;
      case "launch":
        description = `launched ${entry.package ?? entry.uri ?? ""}`;
        break;
      case "screenshot":
        description = "took screenshot";
        break;
      default:
        description = actionType;
    }

    lines.push(`Step ${index + 1}: ${description} - ${entry.reason ?? "N/A"} [${outcome}]`);
  });

  return "\n\nPREVIOUS_ACTIONS:\n" + lines.join("\n");
}
|
||||
|
||||
// ===========================================
|
||||
// Main Agent Loop
|
||||
// ===========================================
|
||||
|
||||
/**
 * Runs the perceive → reason → act loop until the LLM answers "done" or the
 * step budget is used up.
 *
 * Each step captures the screen, diffs it against the previous capture
 * (adding a recovery hint once the screen has stayed unchanged for
 * Config.STUCK_THRESHOLD steps), optionally takes a screenshot when no UI
 * elements were found, asks the LLM for a decision, executes it, and records
 * both the decision and its result. LLM failures turn into a "wait" action;
 * action failures turn into a failed result. Neither aborts the loop.
 *
 * @param goal Task description forwarded to the LLM on every step.
 * @param maxSteps Optional step budget; defaults to Config.MAX_STEPS.
 */
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
  const stepBudget = maxSteps ?? Config.MAX_STEPS;

  console.log("Android Action Kernel Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${stepBudget} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision fallback: ${Config.VISION_ENABLED ? "ON" : "OFF"}`);

  const provider = getLlmProvider();
  const decisions: ActionDecision[] = [];
  const outcomes: ActionResult[] = [];
  let lastSeenElements: UIElement[] = [];
  let unchangedStreak = 0;

  for (let index = 0; index < stepBudget; index += 1) {
    console.log(`\n--- Step ${index + 1}/${stepBudget} ---`);

    // Perception: capture the current screen
    console.log("Scanning screen...");
    const screen = getScreenState();

    // Diffing: compare with the previous capture to spot stuck loops
    let changeNote = "";
    if (index > 0) {
      const diff = diffScreenState(lastSeenElements, screen.elements);
      changeNote = `\n\nSCREEN_CHANGE: ${diff.summary}`;

      if (diff.changed) {
        unchangedStreak = 0;
      } else {
        unchangedStreak += 1;
        console.log(`Warning: Screen unchanged for ${unchangedStreak} step(s).`);
        if (unchangedStreak >= Config.STUCK_THRESHOLD) {
          console.log(`Stuck for ${unchangedStreak} steps. Injecting recovery hint.`);
          changeNote += [
            `\nWARNING: You have been stuck for ${unchangedStreak} steps. `,
            `The screen is NOT changing. Try a DIFFERENT action: `,
            `swipe to scroll, press back, go home, or launch a different app.`,
          ].join("");
        }
      }
    }
    lastSeenElements = screen.elements;

    // Vision fallback: no UI elements were found, so grab a screenshot
    let visionNote = "";
    if (Config.VISION_ENABLED && screen.elements.length === 0) {
      console.log("Accessibility tree empty. Attempting vision fallback...");
      const screenshotPath = captureScreenshot();
      if (screenshotPath) {
        visionNote = [
          "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. ",
          "A screenshot has been captured. The screen likely contains custom-drawn ",
          "content (game, WebView, or Flutter). Try using coordinate-based taps on ",
          "common UI positions, or use 'back'/'home' to navigate away. ",
          "If you know the app package name, use 'launch' to restart it.",
        ].join("");
        console.log("Vision fallback: screenshot captured for context.");
      }
    }

    // Reasoning: ask the LLM what to do next
    console.log("Thinking...");
    const historyNote = formatActionHistory(decisions, outcomes);
    const prompt = screen.json + historyNote + changeNote + visionNote;

    let decision: ActionDecision;
    try {
      decision = await provider.getDecision(goal, prompt, decisions);
    } catch (err) {
      console.log(`LLM Error: ${(err as Error).message}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting for retry" };
    }

    console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"}`);

    // Action: carry out the decision
    let outcome: ActionResult;
    try {
      outcome = executeAction(decision);
    } catch (err) {
      console.log(`Action Error: ${(err as Error).message}`);
      outcome = { success: false, message: (err as Error).message };
    }

    // Record the step before checking for completion
    decisions.push(decision);
    outcomes.push(outcome);

    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      return;
    }

    // Give the UI time to settle before the next capture
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }

  console.log("\nMax steps reached. Task may be incomplete.");
}
|
||||
|
||||
// ===========================================
|
||||
// Entry Point
|
||||
// ===========================================
|
||||
|
||||
/**
 * Reads the user's goal from stdin.
 *
 * Only the first chunk delivered by the stdin stream is consumed, decoded as
 * text and trimmed. A failed read rejects the returned promise (instead of
 * leaving the caller waiting forever), and the reader lock is always released.
 *
 * @returns The trimmed goal text, or "" when stdin closes without data.
 */
async function readGoalFromStdin(): Promise<string> {
  const reader = Bun.stdin.stream().getReader();
  try {
    const { value } = await reader.read();
    // `value` is undefined when the stream ends before any data arrives.
    return value ? new TextDecoder().decode(value).trim() : "";
  } finally {
    reader.releaseLock();
  }
}

/**
 * Program entry point: validates configuration, prompts for a goal on stdin
 * and hands it to the agent loop.
 *
 * Returns early (after logging) when the configuration is invalid or when no
 * goal was entered.
 */
async function main(): Promise<void> {
  try {
    Config.validate();
  } catch (e) {
    console.log(`Configuration Error: ${(e as Error).message}`);
    return;
  }

  // Read user input from stdin
  process.stdout.write("Enter your goal: ");
  const goal = await readGoalFromStdin();

  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }

  await runAgent(goal);
}

// Surface unexpected failures explicitly rather than leaving a floating promise.
main().catch((err: unknown) => {
  console.log(`Fatal Error: ${err instanceof Error ? err.message : String(err)}`);
  process.exitCode = 1;
});
|
||||
327
android-action-kernel/src/llm-providers.ts
Normal file
327
android-action-kernel/src/llm-providers.ts
Normal file
@@ -0,0 +1,327 @@
|
||||
/**
|
||||
* LLM Provider module for Android Action Kernel.
|
||||
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
|
||||
*/
|
||||
|
||||
import OpenAI from "openai";
|
||||
import {
|
||||
BedrockRuntimeClient,
|
||||
InvokeModelCommand,
|
||||
} from "@aws-sdk/client-bedrock-runtime";
|
||||
import { generateText } from "ai";
|
||||
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
GROQ_API_BASE_URL,
|
||||
BEDROCK_ANTHROPIC_MODELS,
|
||||
BEDROCK_META_MODELS,
|
||||
} from "./constants.js";
|
||||
import type { ActionDecision } from "./actions.js";
|
||||
|
||||
// ===========================================
|
||||
// System Prompt — all 15 actions + rich element context
|
||||
// ===========================================
|
||||
|
||||
const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
|
||||
|
||||
You will receive:
|
||||
1. GOAL — the user's task.
|
||||
2. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates, states, and hierarchy.
|
||||
3. PREVIOUS_ACTIONS — your action history with outcomes (OK/FAILED).
|
||||
4. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
|
||||
5. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
|
||||
|
||||
You must output ONLY a valid JSON object with your next action.
|
||||
|
||||
═══════════════════════════════════════════
|
||||
AVAILABLE ACTIONS (15 total)
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Navigation:
|
||||
{"action": "tap", "coordinates": [x, y], "reason": "..."}
|
||||
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
|
||||
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
|
||||
{"action": "enter", "reason": "Press Enter/submit"}
|
||||
{"action": "back", "reason": "Navigate back"}
|
||||
{"action": "home", "reason": "Go to home screen"}
|
||||
|
||||
Text Input:
|
||||
{"action": "type", "text": "Hello World", "reason": "..."}
|
||||
{"action": "clear", "reason": "Clear current text field before typing"}
|
||||
|
||||
App Control:
|
||||
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
|
||||
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
|
||||
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
|
||||
|
||||
Data:
|
||||
{"action": "screenshot", "reason": "Capture current screen"}
|
||||
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
|
||||
{"action": "clipboard_get", "reason": "Read clipboard contents"}
|
||||
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
|
||||
|
||||
System:
|
||||
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
|
||||
{"action": "wait", "reason": "Wait for screen to load"}
|
||||
{"action": "done", "reason": "Task is complete"}
|
||||
|
||||
═══════════════════════════════════════════
|
||||
ELEMENT PROPERTIES YOU WILL SEE
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Each element in SCREEN_CONTEXT has:
|
||||
- text: visible label or content description
|
||||
- center: [x, y] coordinates to tap
|
||||
- size: [width, height] in pixels
|
||||
- enabled: whether the element can be interacted with (DO NOT tap disabled elements!)
|
||||
- checked: checkbox/toggle state (true = ON)
|
||||
- focused: whether this field currently has input focus
|
||||
- selected: whether this item is currently selected (tabs, list items)
|
||||
- scrollable: whether this container can be scrolled
|
||||
- longClickable: supports long-press for context menu
|
||||
- editable: text input field
|
||||
- password: password input (don't read/log the text)
|
||||
- hint: placeholder text shown when field is empty
|
||||
- parent: the containing element (helps understand layout hierarchy)
|
||||
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
|
||||
|
||||
═══════════════════════════════════════════
|
||||
CRITICAL RULES
|
||||
═══════════════════════════════════════════
|
||||
|
||||
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
|
||||
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
|
||||
3. ALREADY TYPED: Check PREVIOUS_ACTIONS. Do NOT re-type text you already entered.
|
||||
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
|
||||
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
|
||||
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
|
||||
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
|
||||
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
|
||||
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
|
||||
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
|
||||
11. PASSWORDS: Never log or output the text of password fields.
|
||||
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
|
||||
13. SEARCH: After typing in a search field, use "enter" to submit the search.
|
||||
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
|
||||
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
|
||||
|
||||
// ===========================================
|
||||
// Provider Interface
|
||||
// ===========================================
|
||||
|
||||
/**
 * One previously taken action, in the form accepted by
 * {@link LLMProvider.getDecision}. Every field is optional.
 */
interface ActionHistoryEntry {
  /** Action name, e.g. "tap", "type" or "launch". */
  action?: string;
  /** Free-text justification that accompanied the action. */
  reason?: string;
  /** Text payload of the action, when it carried one. */
  text?: string;
  /** Screen position as an [x, y] pair. */
  coordinates?: [number, number];
  /** Android package name, when the action targeted an app. */
  package?: string;
  /** URI passed along with the action, if any. */
  uri?: string;
}
|
||||
|
||||
/**
 * Contract implemented by every LLM backend in this module.
 */
export interface LLMProvider {
  /**
   * Asks the model for the next action to perform.
   *
   * @param goal - The user's task description.
   * @param screenContext - Serialized screen state sent to the model as SCREEN_CONTEXT.
   * @param actionHistory - Actions taken so far.
   * @returns The decision produced from the model's reply.
   */
  getDecision(
    goal: string,
    screenContext: string,
    actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision>;
}
|
||||
|
||||
// ===========================================
|
||||
// OpenAI / Groq Provider
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider built on the OpenAI SDK client.
 *
 * When `Config.LLM_PROVIDER` is "groq" the same client is pointed at
 * `GROQ_API_BASE_URL` with the Groq key and model; otherwise the OpenAI key
 * and model are used.
 */
class OpenAIProvider implements LLMProvider {
  private client: OpenAI;
  private model: string;

  constructor() {
    if (Config.LLM_PROVIDER === "groq") {
      this.client = new OpenAI({
        apiKey: Config.GROQ_API_KEY,
        baseURL: GROQ_API_BASE_URL,
      });
      this.model = Config.GROQ_MODEL;
    } else {
      this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
      this.model = Config.OPENAI_MODEL;
    }
  }

  /**
   * Requests a JSON-object completion and converts it into a decision.
   *
   * @param goal - The user's task description.
   * @param screenContext - Screen state; per the kernel it already includes
   *   history, diff and vision context.
   * @param _actionHistory - Unused by this provider.
   * @returns The parsed decision. An empty, missing or unparseable reply is
   *   handled by `parseJsonResponse` (same path as the other providers)
   *   instead of throwing or yielding an object without an `action`.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`;

    const response = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: [
        { role: "system", content: SYSTEM_PROMPT },
        { role: "user", content: userContent },
      ],
    });

    // `choices` may be empty and `content` may be null; treat both as "no reply".
    const content = response.choices[0]?.message?.content ?? "";
    return parseJsonResponse(content);
  }
}
|
||||
|
||||
// ===========================================
|
||||
// OpenRouter Provider (Vercel AI SDK)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider that talks to OpenRouter through `generateText` from the "ai"
 * package, using the model named by `Config.OPENROUTER_MODEL`.
 */
class OpenRouterProvider implements LLMProvider {
  private openrouter: ReturnType<typeof createOpenRouter>;
  private model: string;

  constructor() {
    const apiKey = Config.OPENROUTER_API_KEY;
    this.openrouter = createOpenRouter({ apiKey });
    this.model = Config.OPENROUTER_MODEL;
  }

  /**
   * Sends the goal and screen context as a single prompt and parses the
   * textual reply into a decision.
   *
   * @param goal - The user's task description.
   * @param screenContext - Serialized screen state.
   * @param _actionHistory - Unused by this provider.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    const prompt =
      `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}` +
      "\n\nRespond with ONLY a valid JSON object.";

    const { text } = await generateText({
      model: this.openrouter.chat(this.model),
      system: SYSTEM_PROMPT,
      prompt,
    });

    return parseJsonResponse(text);
  }
}
|
||||
|
||||
// ===========================================
|
||||
// AWS Bedrock Provider
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider that invokes a model through the AWS Bedrock runtime client.
 *
 * The request and response formats depend on the model family, which is
 * detected from `Config.BEDROCK_MODEL`: an Anthropic-style messages payload,
 * a Meta-style raw prompt, or (for anything else) an `inputText` payload.
 */
class BedrockProvider implements LLMProvider {
  private client: BedrockRuntimeClient;
  private model: string;

  constructor() {
    this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
    this.model = Config.BEDROCK_MODEL;
  }

  /**
   * Invokes the configured model and parses its reply into a decision.
   *
   * @param goal - The user's task description.
   * @param screenContext - Serialized screen state.
   * @param _actionHistory - Unused by this provider.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`;
    const requestBody = this.buildRequest(userContent);

    const command = new InvokeModelCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
      accept: "application/json",
    });

    const response = await this.client.send(command);
    const responseBody = JSON.parse(new TextDecoder().decode(response.body));
    const resultText = this.extractResponse(responseBody);

    return parseJsonResponse(resultText);
  }

  /**
   * True when the model id contains one of `BEDROCK_ANTHROPIC_MODELS`.
   * NOTE(review): unlike `isMetaModel`, the id is not lower-cased here —
   * confirm against the constants whether that asymmetry is intended.
   */
  private isAnthropicModel(): boolean {
    return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id));
  }

  /** True when the lower-cased model id contains one of `BEDROCK_META_MODELS`. */
  private isMetaModel(): boolean {
    const lowered = this.model.toLowerCase();
    return BEDROCK_META_MODELS.some((id) => lowered.includes(id));
  }

  /** Serializes the request body in the format of the detected model family. */
  private buildRequest(userContent: string): string {
    if (this.isAnthropicModel()) {
      return JSON.stringify({
        anthropic_version: "bedrock-2023-05-31",
        max_tokens: 1024,
        system: SYSTEM_PROMPT,
        messages: [
          {
            role: "user",
            content:
              userContent + "\n\nRespond with ONLY a valid JSON object.",
          },
        ],
      });
    }

    if (this.isMetaModel()) {
      return JSON.stringify({
        prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${userContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
        max_gen_len: 512,
        temperature: 0.1,
      });
    }

    return JSON.stringify({
      inputText: `${SYSTEM_PROMPT}\n\n${userContent}\n\nRespond with ONLY a valid JSON object.`,
      textGenerationConfig: {
        maxTokenCount: 512,
        temperature: 0.1,
      },
    });
  }

  /**
   * Pulls the generated text out of a decoded response body.
   *
   * Every branch degrades to "" when the expected field is missing or is not
   * a string, so an unexpected payload reaches the parser's fallback path
   * instead of raising a TypeError here.
   */
  private extractResponse(responseBody: Record<string, any>): string {
    let text: unknown;

    if (this.isAnthropicModel()) {
      // Take the first content entry that actually carries a string `text`.
      const blocks: unknown = responseBody?.content;
      text = Array.isArray(blocks)
        ? blocks.find((block) => typeof block?.text === "string")?.text
        : undefined;
    } else if (this.isMetaModel()) {
      text = responseBody?.generation;
    } else {
      text = responseBody?.results?.[0]?.outputText;
    }

    return typeof text === "string" ? text : "";
  }
}
|
||||
|
||||
// ===========================================
|
||||
// Shared JSON Parsing
|
||||
// ===========================================
|
||||
|
||||
/** True when `value` is a plain JSON object carrying a string `action`. */
function isDecisionLike(value: unknown): boolean {
  return (
    typeof value === "object" &&
    value !== null &&
    !Array.isArray(value) &&
    typeof (value as { action?: unknown }).action === "string"
  );
}

/**
 * Returns the index of the `}` that closes the `{` at `start`, or -1 when the
 * braces never balance. Braces inside JSON string literals are ignored.
 */
function findMatchingBrace(text: string, start: number): number {
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === "{") depth++;
    else if (ch === "}" && --depth === 0) return i;
  }
  return -1;
}

/**
 * Scans `text` for the first balanced `{...}` span that parses as JSON and
 * looks like a decision. Returns undefined when there is none.
 */
function extractEmbeddedDecision(text: string): unknown {
  for (
    let start = text.indexOf("{");
    start !== -1;
    start = text.indexOf("{", start + 1)
  ) {
    const end = findMatchingBrace(text, start);
    if (end === -1) continue;
    try {
      const candidate: unknown = JSON.parse(text.slice(start, end + 1));
      if (isDecisionLike(candidate)) return candidate;
    } catch {
      // Not valid JSON from this brace; try the next opening brace.
    }
  }
  return undefined;
}

/**
 * Converts raw model output into an action decision.
 *
 * The text is first parsed as JSON as-is. If that fails, or the result is not
 * an object with a string `action`, the text is scanned for an embedded JSON
 * object (e.g. inside a markdown code fence or surrounded by prose). The scan
 * balances braces, so decisions containing nested objects such as `extras`,
 * or `}` characters inside strings, are recovered intact.
 *
 * @param text - Raw text returned by the model.
 * @returns The parsed decision, or a "wait" decision when nothing usable is
 *   found (a warning with the first 200 characters of `text` is logged).
 */
function parseJsonResponse(text: string): ActionDecision {
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    parsed = undefined;
  }

  if (!isDecisionLike(parsed)) {
    // Try to extract JSON from markdown code blocks or mixed text
    parsed = extractEmbeddedDecision(text);
  }

  if (isDecisionLike(parsed)) {
    return parsed as ActionDecision;
  }

  console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
  return { action: "wait", reason: "Failed to parse response, waiting" };
}
|
||||
|
||||
// ===========================================
|
||||
// Factory
|
||||
// ===========================================
|
||||
|
||||
/**
 * Builds the provider selected by `Config.LLM_PROVIDER`.
 *
 * "bedrock" and "openrouter" map to their dedicated providers; every other
 * value is served by `OpenAIProvider`.
 */
export function getLlmProvider(): LLMProvider {
  switch (Config.LLM_PROVIDER) {
    case "bedrock":
      return new BedrockProvider();
    case "openrouter":
      return new OpenRouterProvider();
    default:
      return new OpenAIProvider();
  }
}
|
||||
171
android-action-kernel/src/sanitizer.ts
Normal file
171
android-action-kernel/src/sanitizer.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* XML Sanitizer for Android Action Kernel.
|
||||
* Parses Android Accessibility XML and extracts interactive UI elements
|
||||
* with full state information and parent-child hierarchy context.
|
||||
*/
|
||||
|
||||
import { XMLParser } from "fast-xml-parser";
|
||||
|
||||
/**
 * A single UI element extracted from the accessibility XML, together with its
 * geometry, state flags and hierarchy context.
 */
export interface UIElement {
  // --- Identity -----------------------------------------------------------
  /** The node's `resource-id` attribute ("" when absent). */
  id: string;
  /** The node's `text`, falling back to its `content-desc`. */
  text: string;
  /** Last dot-separated segment of the node's `class` attribute. */
  type: string;

  // --- Geometry -----------------------------------------------------------
  /** Raw `bounds` attribute string. */
  bounds: string;
  /** Midpoint of the bounds as [x, y], rounded down. */
  center: [number, number];
  /** [width, height] computed from the bounds. */
  size: [number, number];

  // --- State flags --------------------------------------------------------
  clickable: boolean;
  /** True for EditText / AutoCompleteTextView classes or `editable="true"`. */
  editable: boolean;
  /** False only when the node explicitly carries `enabled="false"`. */
  enabled: boolean;
  checked: boolean;
  focused: boolean;
  selected: boolean;
  scrollable: boolean;
  longClickable: boolean;
  password: boolean;

  // --- Context ------------------------------------------------------------
  /** The node's `hint` attribute ("" when absent). */
  hint: string;
  /** Interaction suggested for this element. */
  action: "tap" | "type" | "longpress" | "scroll" | "read";
  /** Label of the nearest ancestor that has bounds ("root" at the top). */
  parent: string;
  /** Number of ancestors with bounds above this element. */
  depth: number;
}
|
||||
|
||||
/**
 * Builds a comparable signature string for a screen.
 *
 * Each element contributes `id|text|x,y|enabled|checked` (x,y being its
 * center); the per-element parts are joined with ";". Two screens with the
 * same signature are considered unchanged for comparison purposes.
 */
export function computeScreenHash(elements: UIElement[]): string {
  const signatures: string[] = [];
  for (const element of elements) {
    const [x, y] = element.center;
    signatures.push(
      [element.id, element.text, `${x},${y}`, element.enabled, element.checked].join("|")
    );
  }
  return signatures.join(";");
}
|
||||
|
||||
/** Matches a bounds attribute of the form "[x1,y1][x2,y2]" and captures the four integers. */
const BOUNDS_PATTERN =
  /^\s*\[\s*(-?\d+)\s*,\s*(-?\d+)\s*\]\s*\[\s*(-?\d+)\s*,\s*(-?\d+)\s*\]\s*$/;

/** Parses a bounds attribute into [x1, y1, x2, y2]; returns null when it is malformed. */
function parseBounds(bounds: unknown): [number, number, number, number] | null {
  if (typeof bounds !== "string") return null;
  const match = BOUNDS_PATTERN.exec(bounds);
  if (!match) return null;
  return [Number(match[1]), Number(match[2]), Number(match[3]), Number(match[4])];
}

/** Reads the attribute `name` from a parsed node; anything that is not a string becomes "". */
function readAttribute(node: Record<string, unknown>, name: string): string {
  const value = node[`@_${name}`];
  return typeof value === "string" ? value : "";
}

/**
 * Parses Android Accessibility XML and returns a rich list of UI elements.
 *
 * A node is reported when it is interactive (clickable, editable,
 * long-clickable or scrollable) or carries text / a content description, and
 * its bounds are well-formed with a positive width and height. Nodes with
 * malformed or zero-size bounds are skipped, but their children are still
 * visited. Each element records its state flags, a suggested action, the
 * label of its nearest ancestor that has bounds, and its depth.
 *
 * @param xmlContent - Raw accessibility XML dump.
 * @returns The extracted elements in document order; an empty list when the
 *   XML cannot be parsed (a warning is logged in that case).
 */
export function getInteractiveElements(xmlContent: string): UIElement[] {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    allowBooleanAttributes: true,
  });

  let parsed: unknown;
  try {
    parsed = parser.parse(xmlContent);
  } catch {
    console.log("Warning: Error parsing XML. The screen might be loading.");
    return [];
  }

  const elements: UIElement[] = [];

  function walk(node: unknown, parentLabel: string, depth: number): void {
    if (!node || typeof node !== "object") return;
    const current = node as Record<string, unknown>;

    // No bounds on this node — just recurse
    if (!current["@_bounds"]) {
      walkChildren(current, parentLabel, depth);
      return;
    }

    const isTrue = (name: string): boolean => current[`@_${name}`] === "true";

    const isClickable = isTrue("clickable");
    const isLongClickable = isTrue("long-clickable");
    const isScrollable = isTrue("scrollable");

    const elementClass = readAttribute(current, "class");
    const isEditable =
      elementClass.includes("EditText") ||
      elementClass.includes("AutoCompleteTextView") ||
      isTrue("editable");

    const text = readAttribute(current, "text");
    const desc = readAttribute(current, "content-desc");
    const resourceId = readAttribute(current, "resource-id");

    // Build a label for this node to use as parent context for children
    const typeName = elementClass.split(".").pop() ?? "";
    const nodeLabel = text || desc || resourceId.split("/").pop() || typeName;

    const isInteractive = isClickable || isEditable || isLongClickable || isScrollable;
    const hasContent = !!(text || desc);

    if (isInteractive || hasContent) {
      // null for malformed bounds: such nodes are skipped rather than
      // reported with NaN coordinates.
      const coords = parseBounds(current["@_bounds"]);
      if (coords) {
        const [x1, y1, x2, y2] = coords;
        const width = x2 - x1;
        const height = y2 - y1;

        // Skip zero-size elements (invisible); children are still walked below.
        if (width > 0 && height > 0) {
          let suggestedAction: UIElement["action"];
          if (isEditable) suggestedAction = "type";
          else if (isLongClickable && !isClickable) suggestedAction = "longpress";
          else if (isScrollable && !isClickable) suggestedAction = "scroll";
          else if (isClickable) suggestedAction = "tap";
          else suggestedAction = "read";

          elements.push({
            id: resourceId,
            text: text || desc,
            type: typeName,
            bounds: current["@_bounds"] as string,
            center: [Math.floor((x1 + x2) / 2), Math.floor((y1 + y2) / 2)],
            size: [width, height],
            clickable: isClickable,
            editable: isEditable,
            enabled: current["@_enabled"] !== "false", // default true
            checked: isTrue("checked"),
            focused: isTrue("focused"),
            selected: isTrue("selected"),
            scrollable: isScrollable,
            longClickable: isLongClickable,
            password: isTrue("password"),
            hint: readAttribute(current, "hint"),
            action: suggestedAction,
            parent: parentLabel,
            depth,
          });
        }
      }
    }

    // Recurse with updated parent label
    walkChildren(current, nodeLabel, depth + 1);
  }

  function walkChildren(
    node: Record<string, unknown>,
    parentLabel: string,
    depth: number
  ): void {
    const childNodes = node.node;
    if (childNodes) {
      // The parser yields a single object for one child and an array for several.
      const children: unknown[] = Array.isArray(childNodes) ? childNodes : [childNodes];
      for (const child of children) {
        walk(child, parentLabel, depth);
      }
    }
    if (node.hierarchy) {
      walk(node.hierarchy, parentLabel, depth);
    }
  }

  walk(parsed, "root", 0);
  return elements;
}
|
||||
19
android-action-kernel/tsconfig.json
Normal file
19
android-action-kernel/tsconfig.json
Normal file
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ES2022",
|
||||
"moduleResolution": "bundler",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"outDir": "dist",
|
||||
"rootDir": "src",
|
||||
"types": ["bun-types"],
|
||||
"resolveJsonModule": true,
|
||||
"declaration": true,
|
||||
"declarationMap": true,
|
||||
"sourceMap": true
|
||||
},
|
||||
"include": ["src/**/*.ts"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
}
|
||||
Reference in New Issue
Block a user