initial commit
This commit is contained in:
BIN
android-action-kernel/.DS_Store
vendored
Normal file
BIN
android-action-kernel/.DS_Store
vendored
Normal file
Binary file not shown.
63
android-action-kernel/.env.example
Normal file
63
android-action-kernel/.env.example
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# Android Action Kernel Configuration (TypeScript/Bun)
|
||||||
|
# Copy this file to .env and fill in your settings
|
||||||
|
# cp .env.example .env
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# Agent Configuration
|
||||||
|
# ===========================================
|
||||||
|
MAX_STEPS=30 # Maximum steps before stopping (30 for complex multi-app tasks)
|
||||||
|
STEP_DELAY=2 # Seconds to wait between steps
|
||||||
|
MAX_RETRIES=3 # Retries on ADB/network failures
|
||||||
|
STUCK_THRESHOLD=3 # Steps before stuck-loop recovery kicks in
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# Vision Fallback (when accessibility tree is empty)
|
||||||
|
# ===========================================
|
||||||
|
VISION_ENABLED=true # Auto-capture screenshot when UI elements not found
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# LLM Provider: "groq", "openai", "bedrock", or "openrouter"
|
||||||
|
# ===========================================
|
||||||
|
LLM_PROVIDER=groq
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# Groq Configuration (Free tier available)
|
||||||
|
# Get your key at: https://console.groq.com
|
||||||
|
# ===========================================
|
||||||
|
GROQ_API_KEY=gsk_your_key_here
|
||||||
|
GROQ_MODEL=llama-3.3-70b-versatile
|
||||||
|
# Other models: llama-3.1-8b-instant (faster, higher rate limits)
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# OpenAI Configuration
|
||||||
|
# Get your key at: https://platform.openai.com
|
||||||
|
# ===========================================
|
||||||
|
OPENAI_API_KEY=sk-your_key_here
|
||||||
|
OPENAI_MODEL=gpt-4o
|
||||||
|
# Other models: gpt-4o-mini (faster, cheaper)
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# AWS Bedrock Configuration
|
||||||
|
# Uses AWS credential chain (run 'aws configure' first)
|
||||||
|
# ===========================================
|
||||||
|
AWS_REGION=us-east-1
|
||||||
|
BEDROCK_MODEL=us.meta.llama3-3-70b-instruct-v1:0
|
||||||
|
# Other models:
|
||||||
|
# anthropic.claude-3-sonnet-20240229-v1:0
|
||||||
|
# anthropic.claude-3-haiku-20240307-v1:0
|
||||||
|
# meta.llama3-8b-instruct-v1:0
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# OpenRouter Configuration (via Vercel AI SDK)
|
||||||
|
# Access 200+ models through a single API
|
||||||
|
# Get your key at: https://openrouter.ai/keys
|
||||||
|
# ===========================================
|
||||||
|
OPENROUTER_API_KEY=sk-or-v1-your_key_here
|
||||||
|
OPENROUTER_MODEL=anthropic/claude-3.5-sonnet
|
||||||
|
# Popular models:
|
||||||
|
# anthropic/claude-3.5-sonnet (best reasoning)
|
||||||
|
# openai/gpt-4o (multimodal)
|
||||||
|
# google/gemini-2.0-flash-001 (fast + cheap)
|
||||||
|
# meta-llama/llama-3.3-70b-instruct (open source)
|
||||||
|
# mistralai/mistral-large-latest (European)
|
||||||
|
# deepseek/deepseek-chat (cost efficient)
|
||||||
4
android-action-kernel/.gitignore
vendored
Normal file
4
android-action-kernel/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
node_modules/
|
||||||
|
dist/
|
||||||
|
bun.lock
|
||||||
|
.env
|
||||||
22
android-action-kernel/package.json
Normal file
22
android-action-kernel/package.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "android-action-kernel",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "AI agent that controls Android devices through the accessibility API - TypeScript/Bun edition",
|
||||||
|
"type": "module",
|
||||||
|
"scripts": {
|
||||||
|
"start": "bun run src/kernel.ts",
|
||||||
|
"build": "bun build src/kernel.ts --outdir dist --target bun",
|
||||||
|
"typecheck": "tsc --noEmit"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@aws-sdk/client-bedrock-runtime": "^3.700.0",
|
||||||
|
"@openrouter/ai-sdk-provider": "^2.1.1",
|
||||||
|
"ai": "^6.0.72",
|
||||||
|
"fast-xml-parser": "^4.5.0",
|
||||||
|
"openai": "^4.73.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/bun": "^1.1.0",
|
||||||
|
"typescript": "^5.6.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
322
android-action-kernel/src/actions.ts
Normal file
322
android-action-kernel/src/actions.ts
Normal file
@@ -0,0 +1,322 @@
|
|||||||
|
/**
|
||||||
|
* Action execution module for Android Action Kernel.
|
||||||
|
* Handles all ADB commands for interacting with Android devices.
|
||||||
|
*
|
||||||
|
* Supported actions:
|
||||||
|
* tap, type, enter, swipe, home, back, wait, done,
|
||||||
|
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { Config } from "./config.js";
|
||||||
|
import {
|
||||||
|
KEYCODE_ENTER,
|
||||||
|
KEYCODE_HOME,
|
||||||
|
KEYCODE_BACK,
|
||||||
|
KEYCODE_DEL,
|
||||||
|
KEYCODE_MOVE_HOME,
|
||||||
|
KEYCODE_MOVE_END,
|
||||||
|
SWIPE_COORDS,
|
||||||
|
SWIPE_DURATION_MS,
|
||||||
|
LONG_PRESS_DURATION_MS,
|
||||||
|
DEVICE_SCREENSHOT_PATH,
|
||||||
|
LOCAL_SCREENSHOT_PATH,
|
||||||
|
} from "./constants.js";
|
||||||
|
|
||||||
|
/**
 * One decision produced by the LLM: the action to run plus the optional
 * arguments that the individual action handlers read.
 */
export interface ActionDecision {
  /** Action name; `executeAction` dispatches on this value. */
  action: string;
  /** Screen point `[x, y]`, read by the tap and long-press handlers. */
  coordinates?: [number, number];
  /** Text payload, read by the type and clipboard_set handlers. */
  text?: string;
  /** Swipe direction, looked up in `SWIPE_COORDS`. */
  direction?: string;
  /** Free-form explanation of the decision (logged by the done handler). */
  reason?: string;

  // --- launch action ---
  package?: string;
  activity?: string;
  uri?: string;
  extras?: Record<string, string>;

  // --- shell action ---
  command?: string;

  // --- screenshot action ---
  filename?: string;
}
|
||||||
|
|
||||||
|
/**
 * Outcome of executing one action, returned to the kernel for tracking.
 */
export interface ActionResult {
  /** Whether the handler considered the action successful. */
  success: boolean;
  /** Human-readable summary of what happened. */
  message: string;
  /** Optional payload (e.g. command output or a saved file name). */
  data?: string;
}
|
||||||
|
|
||||||
|
/**
 * Runs `adb <command...>` synchronously and returns its trimmed stdout.
 *
 * A run counts as failed when stderr contains the word "error"
 * (case-insensitive). Failed runs are retried with exponential backoff
 * (1s, 2s, 4s, ...). When every attempt fails the error is logged and the
 * stdout of the last attempt is returned; ADB errors are never thrown.
 *
 * @param command Arguments passed to the ADB binary (`Config.ADB_PATH`).
 * @param retries Extra attempts after the first one. Non-finite or negative
 *   values (for example `NaN` from an unparsable `MAX_RETRIES`) are treated
 *   as 0 so the command is always executed at least once.
 * @returns Trimmed stdout of the last attempt.
 */
export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): string {
  // Guard against NaN / negative retry counts: previously the loop condition
  // `attempt <= NaN` was false on the first check and nothing was executed.
  const maxRetries = Number.isFinite(retries) && retries > 0 ? Math.floor(retries) : 0;
  let stdout = "";

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const result = Bun.spawnSync([Config.ADB_PATH, ...command], {
      stdout: "pipe",
      stderr: "pipe",
    });

    stdout = result.stdout.toString().trim();
    const stderr = result.stderr.toString().trim();

    if (!stderr.toLowerCase().includes("error")) {
      return stdout;
    }

    if (attempt < maxRetries) {
      const delay = Math.pow(2, attempt) * 1000;
      console.log(`ADB Error (attempt ${attempt + 1}/${maxRetries + 1}): ${stderr}`);
      console.log(`Retrying in ${delay / 1000}s...`);
      Bun.sleepSync(delay);
    } else {
      console.log(`ADB Error (all retries exhausted): ${stderr}`);
    }
  }

  return stdout;
}
|
||||||
|
|
||||||
|
/**
 * Dispatches an LLM decision to the handler registered for its `action`
 * name and returns that handler's result. Unknown action names are logged
 * and reported as a failed result.
 */
export function executeAction(action: ActionDecision): ActionResult {
  // Lookup table instead of a switch; a Map is used so that only the names
  // listed here can ever match.
  const handlers = new Map<string, () => ActionResult>([
    ["tap", () => executeTap(action)],
    ["type", () => executeType(action)],
    ["enter", () => executeEnter()],
    ["swipe", () => executeSwipe(action)],
    ["home", () => executeHome()],
    ["back", () => executeBack()],
    ["wait", () => executeWait()],
    ["done", () => executeDone(action)],
    ["longpress", () => executeLongPress(action)],
    ["screenshot", () => executeScreenshot(action)],
    ["launch", () => executeLaunch(action)],
    ["clear", () => executeClear()],
    ["clipboard_get", () => executeClipboardGet()],
    ["clipboard_set", () => executeClipboardSet(action)],
    ["shell", () => executeShell(action)],
  ]);

  const handler = handlers.get(action.action);
  if (handler !== undefined) {
    return handler();
  }

  console.log(`Warning: Unknown action: ${action.action}`);
  return { success: false, message: `Unknown action: ${action.action}` };
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Original actions (enhanced)
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Taps the screen at `action.coordinates`.
 *
 * @param action Decision whose `coordinates` holds `[x, y]`.
 * @returns A failed result (and no ADB call) when the coordinates are
 *   missing or not two finite numbers; otherwise a success result.
 */
function executeTap(action: ActionDecision): ActionResult {
  // The decision comes from LLM output, so validate at runtime rather than
  // silently tapping (0, 0) when coordinates are absent or malformed.
  const coords: unknown = action.coordinates;
  if (
    !Array.isArray(coords) ||
    coords.length < 2 ||
    !Number.isFinite(coords[0]) ||
    !Number.isFinite(coords[1])
  ) {
    console.log("Warning: tap requested without valid coordinates");
    return { success: false, message: "No valid coordinates to tap" };
  }

  const x: number = coords[0];
  const y: number = coords[1];
  console.log(`Tapping: (${x}, ${y})`);
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);
  return { success: true, message: `Tapped (${x}, ${y})` };
}
|
||||||
|
|
||||||
|
/**
 * Types `action.text` into the focused field via `adb shell input text`.
 *
 * adb joins its arguments with spaces and hands the line to the device
 * shell, so every shell metacharacter is backslash-escaped and spaces are
 * encoded as `%s` (the encoding `input text` uses for a space).
 *
 * @param action Decision whose `text` holds the text to type.
 * @returns A failed result when there is no text; otherwise a success result.
 */
function executeType(action: ActionDecision): ActionResult {
  // Coerce defensively: the decision comes from LLM output.
  const text = String(action.text ?? "");
  if (!text) return { success: false, message: "No text to type" };

  // Single pass so inserted backslashes are never escaped again. Besides
  // the quote/operator characters, `$` and backtick are escaped so the text
  // cannot trigger variable or command substitution on the device, and
  // glob/brace/comment/tilde characters are escaped so they stay literal.
  const escapedText = text
    .replace(/[\\"'`$&|;()<>*?~#!{}[\]]/g, "\\$&")
    .replace(/ /g, "%s");

  console.log(`Typing: ${text}`);
  runAdbCommand(["shell", "input", "text", escapedText]);
  return { success: true, message: `Typed "${text}"` };
}
|
||||||
|
|
||||||
|
/** Sends the Enter key event to the device. */
function executeEnter(): ActionResult {
  const keyEvent = ["shell", "input", "keyevent", KEYCODE_ENTER];
  console.log("Pressing Enter");
  runAdbCommand(keyEvent);

  const outcome: ActionResult = { success: true, message: "Pressed Enter" };
  return outcome;
}
|
||||||
|
|
||||||
|
/**
 * Swipes in the requested direction using the fixed coordinates in
 * `SWIPE_COORDS`.
 *
 * The direction is matched case-insensitively and ignoring surrounding
 * whitespace. A missing or unknown direction falls back to "up", and the
 * log and result message name the direction that was actually swiped.
 *
 * @param action Decision whose `direction` names a key of `SWIPE_COORDS`.
 * @returns A success result naming the direction that was swiped.
 */
function executeSwipe(action: ActionDecision): ActionResult {
  const requested =
    typeof action.direction === "string" ? action.direction.trim().toLowerCase() : "up";

  // Own-property check so inherited keys such as "constructor" cannot be
  // mistaken for a coordinate entry.
  const isKnown = Object.prototype.hasOwnProperty.call(SWIPE_COORDS, requested);
  if (!isKnown) {
    console.log(`Warning: Unknown swipe direction "${String(action.direction)}", using up`);
  }
  const direction = isKnown ? requested : "up";
  const [startX, startY, endX, endY] = SWIPE_COORDS[direction];

  console.log(`Swiping ${direction}`);
  runAdbCommand([
    "shell", "input", "swipe",
    String(startX), String(startY),
    String(endX), String(endY),
    SWIPE_DURATION_MS,
  ]);
  return { success: true, message: `Swiped ${direction}` };
}
|
||||||
|
|
||||||
|
/** Sends the Home key event to the device. */
function executeHome(): ActionResult {
  const keyEvent = ["shell", "input", "keyevent", KEYCODE_HOME];
  console.log("Going Home");
  runAdbCommand(keyEvent);

  const outcome: ActionResult = { success: true, message: "Went to home screen" };
  return outcome;
}
|
||||||
|
|
||||||
|
/** Sends the Back key event to the device. */
function executeBack(): ActionResult {
  const keyEvent = ["shell", "input", "keyevent", KEYCODE_BACK];
  console.log("Going Back");
  runAdbCommand(keyEvent);

  const outcome: ActionResult = { success: true, message: "Went back" };
  return outcome;
}
|
||||||
|
|
||||||
|
/** Blocks for two seconds without touching the device. */
function executeWait(): ActionResult {
  const pauseMs = 2000;
  console.log("Waiting...");
  Bun.sleepSync(pauseMs);

  const outcome: ActionResult = { success: true, message: "Waited 2s" };
  return outcome;
}
|
||||||
|
|
||||||
|
/**
 * Logs that the goal was reached (with the decision's reason when present)
 * and returns a success result whose message is "done".
 */
function executeDone(action: ActionDecision): ActionResult {
  const why = action.reason ?? "Task complete";
  console.log(`Goal Achieved: ${why}`);

  const outcome: ActionResult = { success: true, message: "done" };
  return outcome;
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// New actions
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Long-presses at `action.coordinates` by issuing a swipe whose start and
 * end points are identical and whose duration is `LONG_PRESS_DURATION_MS`.
 *
 * @param action Decision whose `coordinates` holds `[x, y]`.
 * @returns A failed result (and no ADB call) when the coordinates are
 *   missing or not two finite numbers; otherwise a success result.
 */
function executeLongPress(action: ActionDecision): ActionResult {
  // The decision comes from LLM output, so validate at runtime rather than
  // silently long-pressing (0, 0) when coordinates are absent or malformed.
  const coords: unknown = action.coordinates;
  if (
    !Array.isArray(coords) ||
    coords.length < 2 ||
    !Number.isFinite(coords[0]) ||
    !Number.isFinite(coords[1])
  ) {
    console.log("Warning: long press requested without valid coordinates");
    return { success: false, message: "No valid coordinates to long press" };
  }

  const x: number = coords[0];
  const y: number = coords[1];
  console.log(`Long pressing: (${x}, ${y})`);
  runAdbCommand([
    "shell", "input", "swipe",
    String(x), String(y), String(x), String(y),
    LONG_PRESS_DURATION_MS,
  ]);
  return { success: true, message: `Long pressed (${x}, ${y})` };
}
|
||||||
|
|
||||||
|
/**
 * Takes a screenshot on the device and pulls it to the local machine.
 * The local file name is `action.filename` when given, otherwise
 * `LOCAL_SCREENSHOT_PATH`; it is returned in the result's `data`.
 */
function executeScreenshot(action: ActionDecision): ActionResult {
  const target = action.filename ?? LOCAL_SCREENSHOT_PATH;
  console.log(`Taking screenshot → ${target}`);

  // Capture on the device first, then copy the file over.
  const adbSteps: string[][] = [
    ["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH],
    ["pull", DEVICE_SCREENSHOT_PATH, target],
  ];
  for (const step of adbSteps) {
    runAdbCommand(step);
  }

  return { success: true, message: `Screenshot saved to ${target}`, data: target };
}
|
||||||
|
|
||||||
|
/**
 * Launches an app by package name, activity, or URI intent.
 *
 * Examples the LLM can produce:
 *   { action: "launch", package: "com.whatsapp" }
 *   { action: "launch", package: "com.whatsapp", activity: ".HomeActivity" }
 *   { action: "launch", uri: "https://maps.google.com/?q=pizza+near+me" }
 *   { action: "launch", package: "com.whatsapp", uri: "content://media/external/images/1",
 *     extras: { "android.intent.extra.TEXT": "Check this out" } }
 *
 * Routing:
 *   - package only (no activity, no uri): the package's launcher activity is
 *     started via `monkey`; extras are not applied on this path.
 *   - anything else: `am start`, with `-a VIEW -d <uri>` when a uri is given,
 *     `-n <package>/<activity>` when both are given, `-p <package>` when a
 *     package accompanies a uri, and `--es` for every extra.
 *
 * @param action Decision carrying `package`, `activity`, `uri` and/or `extras`.
 * @returns A failed result when neither `package` nor `uri` is provided;
 *   otherwise a success result whose `data` is the command's stdout.
 */
function executeLaunch(action: ActionDecision): ActionResult {
  // adb joins its arguments with spaces and passes the line to the device
  // shell. LLM-supplied values are therefore single-quoted so that spaces
  // and characters such as `&` in URLs survive as one argument.
  const quote = (value: string): string => `'${String(value).replace(/'/g, "'\\''")}'`;

  if (!action.package && !action.uri) {
    return { success: false, message: "Launch requires a package or uri" };
  }

  if (action.package && !action.activity && !action.uri) {
    // Launch the default activity for the package
    const launchResult = runAdbCommand([
      "shell", "monkey", "-p", quote(action.package), "-c",
      "android.intent.category.LAUNCHER", "1",
    ]);
    console.log(`Launching: ${action.package}`);
    return { success: true, message: `Launched ${action.package}`, data: launchResult };
  }

  const args: string[] = ["shell", "am", "start"];

  if (action.uri) {
    args.push("-a", "android.intent.action.VIEW");
    args.push("-d", quote(action.uri));
  }

  if (action.package && action.activity) {
    args.push("-n", quote(`${action.package}/${action.activity}`));
  } else if (action.package) {
    // Package together with a uri: restrict the VIEW intent to that package
    // instead of dropping the uri and extras.
    args.push("-p", quote(action.package));
  }

  // Attach intent extras
  if (action.extras) {
    for (const [key, value] of Object.entries(action.extras)) {
      args.push("--es", quote(key), quote(value));
    }
  }

  const label = action.package ?? action.uri ?? "intent";
  console.log(`Launching: ${label}`);
  const result = runAdbCommand(args);
  return { success: true, message: `Launched ${label}`, data: result };
}
|
||||||
|
|
||||||
|
/**
 * Attempts to clear the currently focused text field by sending three key
 * events in order: move-to-end, a long-press of move-home, then delete.
 */
function executeClear(): ActionResult {
  console.log("Clearing text field");

  // NOTE(review): the long-pressed MOVE_HOME is intended to select the
  // field's text before the delete; confirm it selects on target devices.
  const keyEventArgs: string[][] = [
    [KEYCODE_MOVE_END],
    ["--longpress", KEYCODE_MOVE_HOME],
    [KEYCODE_DEL],
  ];
  for (const extra of keyEventArgs) {
    runAdbCommand(["shell", "input", "keyevent", ...extra]);
  }

  return { success: true, message: "Cleared text field" };
}
|
||||||
|
|
||||||
|
/**
 * Reads the device clipboard.
 *
 * First tries `cmd clipboard get-text`; when that yields no output it falls
 * back to a raw `service call clipboard` and returns that output unparsed.
 * Both paths report success.
 */
function executeClipboardGet(): ActionResult {
  console.log("Reading clipboard");

  const primary = runAdbCommand(["shell", "cmd", "clipboard", "get-text"]);
  if (!primary) {
    // Fallback for older Android versions
    const raw = runAdbCommand([
      "shell", "service", "call", "clipboard", "2", "i32", "1",
    ]);
    return { success: true, message: `Clipboard (raw): ${raw}`, data: raw };
  }

  // Only the first 100 characters are echoed to the console.
  console.log(`Clipboard: ${primary.slice(0, 100)}`);
  return { success: true, message: `Clipboard: ${primary}`, data: primary };
}
|
||||||
|
|
||||||
|
/**
 * Sets the device clipboard to `action.text` via `cmd clipboard set-text`.
 *
 * @param action Decision whose `text` holds the new clipboard content.
 * @returns A failed result when there is no text; otherwise a success result
 *   whose message shows the first 50 characters.
 */
function executeClipboardSet(action: ActionDecision): ActionResult {
  // Coerce defensively: the decision comes from LLM output.
  const text = String(action.text ?? "");
  if (!text) return { success: false, message: "No text to set on clipboard" };

  console.log(`Setting clipboard: ${text.slice(0, 50)}...`);

  // adb joins its arguments with spaces and passes the line to the device
  // shell, so the text is single-quoted to arrive as one argument even when
  // it contains spaces or shell metacharacters.
  const quoted = `'${text.replace(/'/g, "'\\''")}'`;
  runAdbCommand(["shell", "cmd", "clipboard", "set-text", quoted]);
  return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` };
}
|
||||||
|
|
||||||
|
/**
 * Runs `action.command` through `adb shell`, splitting it on single spaces
 * into separate arguments. The result message carries the first 200
 * characters of the output; `data` carries all of it.
 *
 * NOTE(review): the command text comes straight from the LLM decision and
 * is executed on the device without filtering. Consider an allow-list.
 */
function executeShell(action: ActionDecision): ActionResult {
  const commandLine = action.command ?? "";
  if (commandLine) {
    console.log(`Shell: ${commandLine}`);
    const pieces = commandLine.split(" ");
    const output = runAdbCommand(["shell", ...pieces]);
    return { success: true, message: `Shell output: ${output.slice(0, 200)}`, data: output };
  }
  return { success: false, message: "No command provided" };
}
|
||||||
82
android-action-kernel/src/config.ts
Normal file
82
android-action-kernel/src/config.ts
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
/**
|
||||||
|
* Configuration management for Android Action Kernel.
|
||||||
|
* Bun natively loads .env files — no dotenv needed.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import {
|
||||||
|
DEVICE_DUMP_PATH,
|
||||||
|
LOCAL_DUMP_PATH,
|
||||||
|
DEVICE_SCREENSHOT_PATH,
|
||||||
|
LOCAL_SCREENSHOT_PATH,
|
||||||
|
DEFAULT_MAX_STEPS,
|
||||||
|
DEFAULT_STEP_DELAY,
|
||||||
|
DEFAULT_GROQ_MODEL,
|
||||||
|
DEFAULT_OPENAI_MODEL,
|
||||||
|
DEFAULT_BEDROCK_MODEL,
|
||||||
|
DEFAULT_MAX_RETRIES,
|
||||||
|
DEFAULT_STUCK_THRESHOLD,
|
||||||
|
DEFAULT_VISION_ENABLED,
|
||||||
|
} from "./constants.js";
|
||||||
|
|
||||||
|
/**
 * Looks up an environment variable.
 *
 * @param key Variable name.
 * @param fallback Value returned when the variable is not set (default "").
 * @returns The variable's value, or `fallback` when it is null/undefined.
 */
function env(key: string, fallback = ""): string {
  const value = process.env[key];
  if (value === undefined || value === null) {
    return fallback;
  }
  return value;
}
|
||||||
|
|
||||||
|
/** Reads an integer environment variable; falls back when it is unset or not numeric. */
function envInt(key: string, fallback: number): number {
  const parsed = parseInt(env(key, String(fallback)), 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/** Reads a float environment variable; falls back when it is unset or not numeric. */
function envFloat(key: string, fallback: number): number {
  const parsed = parseFloat(env(key, String(fallback)));
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Runtime configuration, resolved once from environment variables with
 * defaults taken from the constants module.
 *
 * Numeric settings fall back to their defaults when the environment value
 * cannot be parsed, so a typo never yields `NaN` (which would, for example,
 * make a `step < MAX_STEPS` loop run zero times).
 */
export const Config = {
  // ADB Configuration
  ADB_PATH: env("ADB_PATH", "adb"),
  SCREEN_DUMP_PATH: DEVICE_DUMP_PATH,
  LOCAL_DUMP_PATH: LOCAL_DUMP_PATH,
  DEVICE_SCREENSHOT_PATH: DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH: LOCAL_SCREENSHOT_PATH,

  // Agent Configuration
  MAX_STEPS: envInt("MAX_STEPS", DEFAULT_MAX_STEPS),
  STEP_DELAY: envFloat("STEP_DELAY", DEFAULT_STEP_DELAY),
  MAX_RETRIES: envInt("MAX_RETRIES", DEFAULT_MAX_RETRIES),
  STUCK_THRESHOLD: envInt("STUCK_THRESHOLD", DEFAULT_STUCK_THRESHOLD),

  // Vision fallback (when accessibility tree is empty).
  // Compared case-insensitively and ignoring surrounding whitespace, so
  // "True" or "true " also enable it.
  VISION_ENABLED:
    env("VISION_ENABLED", String(DEFAULT_VISION_ENABLED)).trim().toLowerCase() === "true",

  // LLM Provider: "groq", "openai", "bedrock", or "openrouter"
  LLM_PROVIDER: env("LLM_PROVIDER", "groq"),

  // Groq Configuration
  GROQ_API_KEY: env("GROQ_API_KEY"),
  GROQ_MODEL: env("GROQ_MODEL", DEFAULT_GROQ_MODEL),

  // OpenAI Configuration
  OPENAI_API_KEY: env("OPENAI_API_KEY"),
  OPENAI_MODEL: env("OPENAI_MODEL", DEFAULT_OPENAI_MODEL),

  // AWS Bedrock Configuration
  AWS_REGION: env("AWS_REGION", "us-east-1"),
  BEDROCK_MODEL: env("BEDROCK_MODEL", DEFAULT_BEDROCK_MODEL),

  // OpenRouter Configuration (via Vercel AI SDK)
  OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
  OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),

  /**
   * Returns the model name configured for the active provider. Any provider
   * other than groq, bedrock or openrouter yields the OpenAI model.
   */
  getModel(): string {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq") return Config.GROQ_MODEL;
    if (provider === "bedrock") return Config.BEDROCK_MODEL;
    if (provider === "openrouter") return Config.OPENROUTER_MODEL;
    return Config.OPENAI_MODEL;
  },

  /**
   * Checks that the API key for the active provider is present.
   *
   * @throws Error when the groq, openai or openrouter provider is selected
   *   and its API key is empty.
   */
  validate(): void {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq" && !Config.GROQ_API_KEY) {
      throw new Error("GROQ_API_KEY is required when using Groq provider");
    }
    if (provider === "openai" && !Config.OPENAI_API_KEY) {
      throw new Error("OPENAI_API_KEY is required when using OpenAI provider");
    }
    if (provider === "openrouter" && !Config.OPENROUTER_API_KEY) {
      throw new Error("OPENROUTER_API_KEY is required when using OpenRouter provider");
    }
    // Bedrock uses AWS credential chain, no explicit validation needed
  },
};
|
||||||
78
android-action-kernel/src/constants.ts
Normal file
78
android-action-kernel/src/constants.ts
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
/**
|
||||||
|
* Constants for Android Action Kernel.
|
||||||
|
* All magic strings, URLs, and fixed values in one place.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// -------------------------------------------
// API endpoints
// -------------------------------------------
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";

// -------------------------------------------
// ADB key codes, passed as strings to `input keyevent`
// (a mix of numeric codes and symbolic names)
// -------------------------------------------
export const KEYCODE_ENTER = "66";
export const KEYCODE_HOME = "KEYCODE_HOME";
export const KEYCODE_BACK = "KEYCODE_BACK";
export const KEYCODE_DEL = "67";
export const KEYCODE_FORWARD_DEL = "112";
export const KEYCODE_MOVE_HOME = "122";
export const KEYCODE_MOVE_END = "123";
export const KEYCODE_MENU = "82";
export const KEYCODE_TAB = "61";
export const KEYCODE_ESCAPE = "111";
export const KEYCODE_DPAD_UP = "19";
export const KEYCODE_DPAD_DOWN = "20";
export const KEYCODE_DPAD_LEFT = "21";
export const KEYCODE_DPAD_RIGHT = "22";
export const KEYCODE_VOLUME_UP = "24";
export const KEYCODE_VOLUME_DOWN = "25";
export const KEYCODE_POWER = "26";

// -------------------------------------------
// Default screen coordinates used by swipe actions.
// Adjust to the resolution of the target device.
// -------------------------------------------
export const SCREEN_CENTER_X = 540;
export const SCREEN_CENTER_Y = 1200;

/** Swipe gestures keyed by direction: [start_x, start_y, end_x, end_y]. */
export const SWIPE_COORDS: Record<string, [number, number, number, number]> = {
  up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500],
  down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500],
  left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y],
  right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y],
};

/** Gesture durations in milliseconds, kept as strings for use as ADB arguments. */
export const SWIPE_DURATION_MS = "300";
export const LONG_PRESS_DURATION_MS = "1000";

// -------------------------------------------
// Default model per provider
// -------------------------------------------
export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile";
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";

// -------------------------------------------
// Bedrock model identifiers
// -------------------------------------------
export const BEDROCK_ANTHROPIC_MODELS = ["anthropic"];
export const BEDROCK_META_MODELS = ["meta", "llama"];

// -------------------------------------------
// File paths (on the device and on the local machine)
// -------------------------------------------
export const DEVICE_DUMP_PATH = "/sdcard/window_dump.xml";
export const LOCAL_DUMP_PATH = "window_dump.xml";
export const DEVICE_SCREENSHOT_PATH = "/sdcard/kernel_screenshot.png";
export const LOCAL_SCREENSHOT_PATH = "kernel_screenshot.png";

// -------------------------------------------
// Agent defaults
// -------------------------------------------
export const DEFAULT_MAX_STEPS = 30;
export const DEFAULT_STEP_DELAY = 2.0;
export const DEFAULT_MAX_RETRIES = 3;
export const DEFAULT_STUCK_THRESHOLD = 3;
export const DEFAULT_VISION_ENABLED = true;
|
||||||
298
android-action-kernel/src/kernel.ts
Normal file
298
android-action-kernel/src/kernel.ts
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
/**
|
||||||
|
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
|
||||||
|
*
|
||||||
|
* An AI agent that controls Android devices through the accessibility API.
|
||||||
|
* Uses LLMs to make decisions based on screen context.
|
||||||
|
*
|
||||||
|
* Features:
|
||||||
|
* - Perception → Reasoning → Action loop
|
||||||
|
* - Screen state diffing (stuck loop detection)
|
||||||
|
* - Error recovery with retries
|
||||||
|
* - Vision fallback when accessibility tree is empty
|
||||||
|
* - Dynamic early exit on goal completion
|
||||||
|
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
|
||||||
|
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* bun run src/kernel.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { existsSync, readFileSync, rmSync } from "fs";
|
||||||
|
|
||||||
|
import { Config } from "./config.js";
|
||||||
|
import {
|
||||||
|
executeAction,
|
||||||
|
runAdbCommand,
|
||||||
|
type ActionDecision,
|
||||||
|
type ActionResult,
|
||||||
|
} from "./actions.js";
|
||||||
|
import { getLlmProvider, type LLMProvider } from "./llm-providers.js";
|
||||||
|
import {
|
||||||
|
getInteractiveElements,
|
||||||
|
computeScreenHash,
|
||||||
|
type UIElement,
|
||||||
|
} from "./sanitizer.js";
|
||||||
|
import {
|
||||||
|
DEVICE_SCREENSHOT_PATH,
|
||||||
|
LOCAL_SCREENSHOT_PATH,
|
||||||
|
} from "./constants.js";
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Screen Perception
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Dumps the current UI hierarchy with uiautomator, pulls the XML to the
 * local machine and returns the parsed interactive elements together with
 * their pretty-printed JSON.
 *
 * Any dump left over from an earlier step is deleted first, both locally
 * and on the device. `runAdbCommand` reports failures by logging rather
 * than throwing, so without this a failed dump or pull would leave the old
 * file in place and it would be parsed as if it were the current screen.
 *
 * @returns The elements and their JSON, or an empty element list with an
 *   error string in `json` when no fresh dump could be obtained.
 */
function getScreenState(): { elements: UIElement[]; json: string } {
  const captureFailed = (): { elements: UIElement[]; json: string } => ({
    elements: [],
    json: "Error: Could not capture screen.",
  });

  try {
    // Remove stale copies so only a fresh dump can satisfy the check below.
    rmSync(Config.LOCAL_DUMP_PATH, { force: true });
    runAdbCommand(["shell", "rm", "-f", Config.SCREEN_DUMP_PATH]);

    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
  } catch {
    console.log("Warning: ADB screen capture failed.");
    return captureFailed();
  }

  if (!existsSync(Config.LOCAL_DUMP_PATH)) {
    return captureFailed();
  }

  const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
  const elements = getInteractiveElements(xmlContent);
  return { elements, json: JSON.stringify(elements, null, 2) };
}
|
||||||
|
|
||||||
|
/**
 * Captures a screenshot on the device, pulls it to `LOCAL_SCREENSHOT_PATH`
 * and returns that path. Used as a vision fallback when the accessibility
 * tree is empty.
 *
 * A screenshot left over from an earlier capture is deleted first.
 * `runAdbCommand` reports failures by logging rather than throwing, so
 * without this a failed capture would return the path of an outdated image.
 *
 * @returns The local screenshot path, or `null` when no fresh file exists
 *   after the pull.
 */
function captureScreenshot(): string | null {
  try {
    // Remove the stale local copy so only a fresh pull can satisfy the check.
    rmSync(LOCAL_SCREENSHOT_PATH, { force: true });

    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);

    if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      return LOCAL_SCREENSHOT_PATH;
    }
  } catch {
    console.log("Warning: Screenshot capture failed.");
  }
  return null;
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Screen State Diffing
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/** Result of comparing two consecutive screen captures. */
interface ScreenDiff {
  /** True when the two screens hash differently. */
  changed: boolean;
  /** Element texts present now but not before. */
  addedTexts: string[];
  /** Element texts present before but not now. */
  removedTexts: string[];
  /** One-line description of the change, included in the LLM prompt context. */
  summary: string;
}
|
||||||
|
|
||||||
|
/**
 * Compares the previous and current element lists.
 *
 * `changed` is decided by comparing `computeScreenHash` of both lists; the
 * added/removed texts are the set differences of the non-empty element
 * texts. The summary names at most five added and five removed texts.
 */
function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const textsOf = (list: UIElement[]) => new Set(list.map((el) => el.text).filter(Boolean));
  const before = textsOf(prevElements);
  const after = textsOf(currElements);

  const addedTexts = Array.from(after).filter((text) => !before.has(text));
  const removedTexts = Array.from(before).filter((text) => !after.has(text));

  const changed = computeScreenHash(prevElements) !== computeScreenHash(currElements);
  if (!changed) {
    return {
      changed,
      addedTexts,
      removedTexts,
      summary: "Screen has NOT changed since last action.",
    };
  }

  const notes: string[] = [];
  if (addedTexts.length > 0) {
    notes.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
  }
  if (removedTexts.length > 0) {
    notes.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
  }

  // The hash can differ without any text difference; say so explicitly.
  const summary = notes.join(". ") || "Screen layout changed.";
  return { changed, addedTexts, removedTexts, summary };
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Action History Formatting
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Renders the executed actions as a "PREVIOUS_ACTIONS" text section, one
 * line per step: what was done, the stated reason, and the outcome
 * (OK / FAILED, or empty when no result is recorded for that step).
 *
 * @returns The section text, or "" when there is no history.
 */
function formatActionHistory(
  actionHistory: ActionDecision[],
  resultHistory: ActionResult[]
): string {
  if (actionHistory.length === 0) return "";

  const lines: string[] = [];
  actionHistory.forEach((entry, index) => {
    const kind = entry.action ?? "unknown";
    const why = entry.reason ?? "N/A";

    const recorded = resultHistory[index];
    let status = "";
    if (recorded) {
      status = recorded.success ? "OK" : "FAILED";
    }

    // Only these four action kinds get a tailored description.
    let what: string;
    switch (kind) {
      case "type":
        what = `typed "${entry.text ?? ""}"`;
        break;
      case "tap":
        what = `tapped ${JSON.stringify(entry.coordinates ?? [])}`;
        break;
      case "launch":
        what = `launched ${entry.package ?? entry.uri ?? ""}`;
        break;
      case "screenshot":
        what = "took screenshot";
        break;
      default:
        what = kind;
    }

    lines.push(`Step ${index + 1}: ${what} - ${why} [${status}]`);
  });

  return "\n\nPREVIOUS_ACTIONS:\n" + lines.join("\n");
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Main Agent Loop
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Runs the perceive -> reason -> act loop until the LLM answers "done" or the
 * step budget is used up.
 *
 * Per step: read the screen, diff it against the previous step (counting
 * consecutive unchanged screens), optionally capture a screenshot when no
 * elements were found, ask the LLM for a decision, execute it, and record
 * both decision and result.
 *
 * LLM failures and action failures never abort the loop: the former degrades
 * to a "wait" decision, the latter is recorded as an unsuccessful result.
 *
 * @param goal - Task description forwarded to the LLM on every step.
 * @param maxSteps - Optional step budget; defaults to `Config.MAX_STEPS`.
 */
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
  const steps = maxSteps ?? Config.MAX_STEPS;

  // Thrown values are not guaranteed to be Error instances; reading `.message`
  // blindly would log and store `undefined`.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  console.log("Android Action Kernel Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision fallback: ${Config.VISION_ENABLED ? "ON" : "OFF"}`);

  const llm = getLlmProvider();
  const actionHistory: ActionDecision[] = [];
  const resultHistory: ActionResult[] = [];
  let prevElements: UIElement[] = [];
  let stuckCount = 0;

  for (let step = 0; step < steps; step++) {
    console.log(`\n--- Step ${step + 1}/${steps} ---`);

    // 1. Perception: Capture screen state
    console.log("Scanning screen...");
    const { elements, json: screenContext } = getScreenState();

    // 2. Screen diff: detect stuck loops (nothing to compare against on step 0)
    let diffContext = "";
    if (step > 0) {
      const diff = diffScreenState(prevElements, elements);
      diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;

      if (!diff.changed) {
        stuckCount++;
        console.log(
          `Warning: Screen unchanged for ${stuckCount} step(s).`
        );
        if (stuckCount >= Config.STUCK_THRESHOLD) {
          console.log(
            `Stuck for ${stuckCount} steps. Injecting recovery hint.`
          );
          diffContext +=
            `\nWARNING: You have been stuck for ${stuckCount} steps. ` +
            `The screen is NOT changing. Try a DIFFERENT action: ` +
            `swipe to scroll, press back, go home, or launch a different app.`;
        }
      } else {
        stuckCount = 0;
      }
    }
    prevElements = elements;

    // 3. Vision fallback: if accessibility tree is empty, use screenshot.
    // Only a textual hint is added to the prompt here; the image itself is
    // not passed to the LLM by this function.
    let visionContext = "";
    if (elements.length === 0 && Config.VISION_ENABLED) {
      console.log("Accessibility tree empty. Attempting vision fallback...");
      const screenshotPath = captureScreenshot();
      if (screenshotPath) {
        visionContext =
          "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
          "A screenshot has been captured. The screen likely contains custom-drawn " +
          "content (game, WebView, or Flutter). Try using coordinate-based taps on " +
          "common UI positions, or use 'back'/'home' to navigate away. " +
          "If you know the app package name, use 'launch' to restart it.";
        console.log("Vision fallback: screenshot captured for context.");
      }
    }

    // 4. Reasoning: Get LLM decision
    console.log("Thinking...");
    const historyStr = formatActionHistory(actionHistory, resultHistory);
    const fullContext = screenContext + historyStr + diffContext + visionContext;

    let decision: ActionDecision;
    try {
      decision = await llm.getDecision(goal, fullContext, actionHistory);
    } catch (err) {
      console.log(`LLM Error: ${describeError(err)}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting for retry" };
    }

    console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"}`);

    // 5. Action: Execute the decision
    let result: ActionResult;
    try {
      result = executeAction(decision);
    } catch (err) {
      const message = describeError(err);
      console.log(`Action Error: ${message}`);
      result = { success: false, message };
    }

    // Track history
    actionHistory.push(decision);
    resultHistory.push(result);

    // 6. Check for goal completion
    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      return;
    }

    // Wait for UI to update
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }

  console.log("\nMax steps reached. Task may be incomplete.");
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Entry Point
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * CLI entry point: validates configuration, reads the goal from stdin and
 * hands it to `runAgent`.
 *
 * Only the first chunk delivered by stdin is used as the goal. An empty goal
 * (after trimming) or a configuration error ends the run with a message.
 */
async function main(): Promise<void> {
  try {
    Config.validate();
  } catch (e) {
    console.log(`Configuration Error: ${e instanceof Error ? e.message : String(e)}`);
    return;
  }

  // Read user input from stdin
  process.stdout.write("Enter your goal: ");
  const reader = Bun.stdin.stream().getReader();
  let goal: string;
  try {
    // Awaiting directly (instead of wrapping in `new Promise`) lets a failed
    // read reject this function rather than leaving it pending forever.
    const { value } = await reader.read();
    // `value` is undefined at end-of-stream; decode() then yields "".
    goal = new TextDecoder().decode(value).trim();
  } finally {
    reader.releaseLock();
  }

  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }

  await runAgent(goal);
}
|
||||||
|
|
||||||
|
// Attach a rejection handler so an unexpected failure is reported and
// reflected in the exit code instead of surfacing as an unhandled rejection.
main().catch((err: unknown) => {
  console.error(`Fatal error: ${err instanceof Error ? err.message : String(err)}`);
  process.exitCode = 1;
});
|
||||||
327
android-action-kernel/src/llm-providers.ts
Normal file
327
android-action-kernel/src/llm-providers.ts
Normal file
@@ -0,0 +1,327 @@
|
|||||||
|
/**
|
||||||
|
* LLM Provider module for Android Action Kernel.
|
||||||
|
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import OpenAI from "openai";
|
||||||
|
import {
|
||||||
|
BedrockRuntimeClient,
|
||||||
|
InvokeModelCommand,
|
||||||
|
} from "@aws-sdk/client-bedrock-runtime";
|
||||||
|
import { generateText } from "ai";
|
||||||
|
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
||||||
|
|
||||||
|
import { Config } from "./config.js";
|
||||||
|
import {
|
||||||
|
GROQ_API_BASE_URL,
|
||||||
|
BEDROCK_ANTHROPIC_MODELS,
|
||||||
|
BEDROCK_META_MODELS,
|
||||||
|
} from "./constants.js";
|
||||||
|
import type { ActionDecision } from "./actions.js";
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// System Prompt — all 15 actions + rich element context
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * System prompt shared by every provider in this file.
 *
 * It describes the inputs the model receives, the JSON shape of each of the
 * 15 actions, the element properties found in SCREEN_CONTEXT, and the rules
 * the model must follow. Every character below is runtime text sent to the
 * LLM, so any edit changes model behavior.
 */
const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.

You will receive:
1. GOAL — the user's task.
2. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates, states, and hierarchy.
3. PREVIOUS_ACTIONS — your action history with outcomes (OK/FAILED).
4. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
5. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).

You must output ONLY a valid JSON object with your next action.

═══════════════════════════════════════════
AVAILABLE ACTIONS (15 total)
═══════════════════════════════════════════

Navigation:
{"action": "tap", "coordinates": [x, y], "reason": "..."}
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
{"action": "enter", "reason": "Press Enter/submit"}
{"action": "back", "reason": "Navigate back"}
{"action": "home", "reason": "Go to home screen"}

Text Input:
{"action": "type", "text": "Hello World", "reason": "..."}
{"action": "clear", "reason": "Clear current text field before typing"}

App Control:
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}

Data:
{"action": "screenshot", "reason": "Capture current screen"}
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
{"action": "clipboard_get", "reason": "Read clipboard contents"}
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}

System:
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
{"action": "wait", "reason": "Wait for screen to load"}
{"action": "done", "reason": "Task is complete"}

═══════════════════════════════════════════
ELEMENT PROPERTIES YOU WILL SEE
═══════════════════════════════════════════

Each element in SCREEN_CONTEXT has:
- text: visible label or content description
- center: [x, y] coordinates to tap
- size: [width, height] in pixels
- enabled: whether the element can be interacted with (DO NOT tap disabled elements!)
- checked: checkbox/toggle state (true = ON)
- focused: whether this field currently has input focus
- selected: whether this item is currently selected (tabs, list items)
- scrollable: whether this container can be scrolled
- longClickable: supports long-press for context menu
- editable: text input field
- password: password input (don't read/log the text)
- hint: placeholder text shown when field is empty
- parent: the containing element (helps understand layout hierarchy)
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"

═══════════════════════════════════════════
CRITICAL RULES
═══════════════════════════════════════════

1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
3. ALREADY TYPED: Check PREVIOUS_ACTIONS. Do NOT re-type text you already entered.
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
11. PASSWORDS: Never log or output the text of password fields.
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
13. SEARCH: After typing in a search field, use "enter" to submit the search.
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Provider Interface
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Loosely-typed view of one past decision as accepted by
 * `LLMProvider.getDecision`. Every field is optional.
 */
interface ActionHistoryEntry {
  /** Action name, e.g. "tap" or "type". */
  action?: string;
  /** Explanation that accompanied the action. */
  reason?: string;
  /** Text payload of the action, when it has one. */
  text?: string;
  /** [x, y] coordinate pair of the action, when it has one. */
  coordinates?: [number, number];
  /** App package name of the action, when it has one. */
  package?: string;
  /** URI of the action, when it has one. */
  uri?: string;
}
|
||||||
|
|
||||||
|
/** Contract implemented by every LLM backend in this file. */
export interface LLMProvider {
  /**
   * Asks the model for the next action.
   *
   * @param goal - The user's task.
   * @param screenContext - Prompt context placed under the SCREEN_CONTEXT
   *   heading of the user message.
   * @param actionHistory - Past decisions. NOTE(review): the providers defined
   *   in this file name this parameter `_actionHistory` and do not read it.
   * @returns The decision parsed from the model's reply.
   */
  getDecision(
    goal: string,
    screenContext: string,
    actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision>;
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// OpenAI / Groq Provider
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Provider backed by the OpenAI SDK client.
 *
 * When `Config.LLM_PROVIDER` is "groq" the same client is pointed at
 * `GROQ_API_BASE_URL` with the Groq key and model; otherwise the OpenAI key
 * and model are used.
 */
class OpenAIProvider implements LLMProvider {
  private readonly client: OpenAI;
  private readonly model: string;

  constructor() {
    if (Config.LLM_PROVIDER === "groq") {
      this.client = new OpenAI({
        apiKey: Config.GROQ_API_KEY,
        baseURL: GROQ_API_BASE_URL,
      });
      this.model = Config.GROQ_MODEL;
    } else {
      this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
      this.model = Config.OPENAI_MODEL;
    }
  }

  /**
   * Requests a JSON-object completion and parses it into a decision.
   *
   * @param goal - The user's task.
   * @param screenContext - Context from the kernel (already includes history,
   *   diff, and vision context).
   * @param _actionHistory - Unused; accepted to satisfy `LLMProvider`.
   * @returns The parsed decision, or the "wait" fallback produced by
   *   `parseJsonResponse` when the reply is missing or not usable JSON.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    // screenContext now includes history, diff, and vision context from kernel
    const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`;

    const response = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: [
        { role: "system", content: SYSTEM_PROMPT },
        { role: "user", content: userContent },
      ],
    });

    // The reply may have no choices or null content. Route it through the
    // shared parser, as the other providers do, so such replies become the
    // "wait" fallback instead of a thrown error or an action-less `{}`.
    const content = response.choices[0]?.message?.content ?? "";
    return parseJsonResponse(content);
  }
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// OpenRouter Provider (Vercel AI SDK)
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Provider that talks to OpenRouter through the AI SDK's `generateText`.
 */
class OpenRouterProvider implements LLMProvider {
  private openrouter: ReturnType<typeof createOpenRouter>;
  private model: string;

  constructor() {
    const apiKey = Config.OPENROUTER_API_KEY;
    this.openrouter = createOpenRouter({ apiKey });
    this.model = Config.OPENROUTER_MODEL;
  }

  /**
   * Sends the goal and screen context as a single prompt and parses the
   * textual reply with `parseJsonResponse`.
   *
   * @param goal - The user's task.
   * @param screenContext - Context placed under the SCREEN_CONTEXT heading.
   * @param _actionHistory - Unused; accepted to satisfy `LLMProvider`.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    const prompt =
      `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}` +
      "\n\nRespond with ONLY a valid JSON object.";

    const { text } = await generateText({
      model: this.openrouter.chat(this.model),
      system: SYSTEM_PROMPT,
      prompt,
    });

    return parseJsonResponse(text);
  }
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// AWS Bedrock Provider
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Provider that invokes a model through AWS Bedrock's `InvokeModel` call.
 *
 * Request and response shapes differ per model family. The family is chosen
 * by matching the configured model id against `BEDROCK_ANTHROPIC_MODELS` and
 * `BEDROCK_META_MODELS`; anything else uses the `inputText` /
 * `textGenerationConfig` shape (presumably Amazon Titan — confirm).
 */
class BedrockProvider implements LLMProvider {
  private client: BedrockRuntimeClient;
  private model: string;

  constructor() {
    this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
    this.model = Config.BEDROCK_MODEL;
  }

  /**
   * Builds the family-specific request, invokes the model, and parses the
   * extracted reply text with `parseJsonResponse`.
   *
   * @param goal - The user's task.
   * @param screenContext - Context placed under the SCREEN_CONTEXT heading.
   * @param _actionHistory - Unused; accepted to satisfy `LLMProvider`.
   */
  async getDecision(
    goal: string,
    screenContext: string,
    _actionHistory: ActionHistoryEntry[]
  ): Promise<ActionDecision> {
    const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`;
    const requestBody = this.buildRequest(userContent);

    const command = new InvokeModelCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
      accept: "application/json",
    });

    const response = await this.client.send(command);
    const responseBody = JSON.parse(new TextDecoder().decode(response.body));
    const resultText = this.extractResponse(responseBody);

    return parseJsonResponse(resultText);
  }

  /** True when the model id contains one of the Anthropic identifiers (case-sensitive). */
  private isAnthropicModel(): boolean {
    return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id));
  }

  /** True when the lower-cased model id contains one of the Meta identifiers. */
  private isMetaModel(): boolean {
    return BEDROCK_META_MODELS.some((id) =>
      this.model.toLowerCase().includes(id)
    );
  }

  /** Serializes the request body in the shape used for the detected model family. */
  private buildRequest(userContent: string): string {
    if (this.isAnthropicModel()) {
      return JSON.stringify({
        anthropic_version: "bedrock-2023-05-31",
        max_tokens: 1024,
        system: SYSTEM_PROMPT,
        messages: [
          {
            role: "user",
            content:
              userContent + "\n\nRespond with ONLY a valid JSON object.",
          },
        ],
      });
    }

    if (this.isMetaModel()) {
      return JSON.stringify({
        prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${userContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
        max_gen_len: 512,
        temperature: 0.1,
      });
    }

    return JSON.stringify({
      inputText: `${SYSTEM_PROMPT}\n\n${userContent}\n\nRespond with ONLY a valid JSON object.`,
      textGenerationConfig: {
        maxTokenCount: 512,
        temperature: 0.1,
      },
    });
  }

  /**
   * Pulls the generated text out of the family-specific response body.
   *
   * A body without the expected fields yields "" for every family, so the
   * caller's `parseJsonResponse` produces its "wait" fallback rather than
   * this method throwing on an undefined index.
   */
  private extractResponse(responseBody: Record<string, any>): string {
    if (this.isAnthropicModel()) {
      return responseBody.content?.[0]?.text ?? "";
    }
    if (this.isMetaModel()) {
      return responseBody.generation ?? "";
    }
    return responseBody.results?.[0]?.outputText ?? "";
  }
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Shared JSON Parsing
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Turns raw LLM output into an action decision, tolerating markdown fences
 * and surrounding prose.
 *
 * Strategy:
 *  1. Parse the whole text as JSON and accept it if it is a plain object.
 *  2. Otherwise scan for brace-balanced `{...}` spans (string-aware, so braces
 *     inside JSON strings and nested objects such as `extras` are handled)
 *     and accept the first one that parses to an object with a string `action`.
 *  3. Otherwise log a warning and return a "wait" decision.
 *
 * Non-object JSON (null, arrays, numbers, strings) is never returned, because
 * callers read `.action` on the result.
 *
 * @param text - Raw model output.
 * @returns The parsed decision, or the "wait" fallback.
 */
function parseJsonResponse(text: string): ActionDecision {
  /** Parses `raw`; returns the value only when it is a plain JSON object. */
  const parseObject = (raw: string): Record<string, unknown> | undefined => {
    try {
      const value: unknown = JSON.parse(raw);
      if (typeof value === "object" && value !== null && !Array.isArray(value)) {
        return value as Record<string, unknown>;
      }
    } catch {
      // Not valid JSON; the caller tries the next candidate.
    }
    return undefined;
  };

  /** Returns the brace-balanced span starting at `start`, if it closes. */
  const balancedSpanAt = (start: number): string | undefined => {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === "{") {
        depth++;
      } else if (ch === "}") {
        depth--;
        if (depth === 0) return text.slice(start, i + 1);
      }
    }
    return undefined;
  };

  const direct = parseObject(text);
  if (direct) {
    return direct as ActionDecision;
  }

  // Try to extract JSON from markdown code blocks or mixed text
  for (
    let start = text.indexOf("{");
    start !== -1;
    start = text.indexOf("{", start + 1)
  ) {
    const span = balancedSpanAt(start);
    if (span === undefined) continue;
    const candidate = parseObject(span);
    // Require an action so an inner object (e.g. `extras`) of a malformed
    // reply is not mistaken for the decision itself.
    if (candidate && typeof candidate.action === "string") {
      return candidate as ActionDecision;
    }
  }

  console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
  return { action: "wait", reason: "Failed to parse response, waiting" };
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Factory
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Instantiates the provider selected by `Config.LLM_PROVIDER`.
 *
 * "bedrock" and "openrouter" map to their dedicated providers; every other
 * value falls through to `OpenAIProvider`.
 */
export function getLlmProvider(): LLMProvider {
  switch (Config.LLM_PROVIDER) {
    case "bedrock":
      return new BedrockProvider();
    case "openrouter":
      return new OpenRouterProvider();
    default:
      return new OpenAIProvider();
  }
}
|
||||||
171
android-action-kernel/src/sanitizer.ts
Normal file
171
android-action-kernel/src/sanitizer.ts
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
/**
|
||||||
|
* XML Sanitizer for Android Action Kernel.
|
||||||
|
* Parses Android Accessibility XML and extracts interactive UI elements
|
||||||
|
* with full state information and parent-child hierarchy context.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { XMLParser } from "fast-xml-parser";
|
||||||
|
|
||||||
|
/** One UI node extracted from the accessibility XML by `getInteractiveElements`. */
export interface UIElement {
  /** The node's `resource-id` attribute ("" when absent). */
  id: string;
  /** The node's `text` attribute, falling back to `content-desc` when empty. */
  text: string;
  /** Last dot-separated segment of the node's `class` attribute. */
  type: string;
  /** The node's raw `bounds` attribute string. */
  bounds: string;
  /** Floored midpoint [x, y] of the bounds rectangle. */
  center: [number, number];
  /** [width, height] of the bounds rectangle. */
  size: [number, number];
  /** True when the `clickable` attribute is "true". */
  clickable: boolean;
  /** True when the class name contains EditText/AutoCompleteTextView or `editable` is "true". */
  editable: boolean;
  /** True unless the `enabled` attribute is explicitly "false". */
  enabled: boolean;
  /** True when the `checked` attribute is "true". */
  checked: boolean;
  /** True when the `focused` attribute is "true". */
  focused: boolean;
  /** True when the `selected` attribute is "true". */
  selected: boolean;
  /** True when the `scrollable` attribute is "true". */
  scrollable: boolean;
  /** True when the `long-clickable` attribute is "true". */
  longClickable: boolean;
  /** True when the `password` attribute is "true". */
  password: boolean;
  /** The node's `hint` attribute ("" when absent). */
  hint: string;
  /**
   * Suggested interaction: "type" if editable; else "longpress" if
   * long-clickable but not clickable; else "scroll" if scrollable but not
   * clickable; else "tap" if clickable; otherwise "read".
   */
  action: "tap" | "type" | "longpress" | "scroll" | "read";
  /** Label of the nearest ancestor that has bounds ("root" at the top level). */
  parent: string;
  /** Number of ancestors with bounds above this node (0 at the top level). */
  depth: number;
}
|
||||||
|
|
||||||
|
/**
 * Builds a comparable signature string for a list of screen elements.
 *
 * Each element contributes `id|text|x,y|enabled|checked`, and the per-element
 * entries are joined with ";". Despite the name, no hashing is applied: the
 * result is the plain concatenation.
 */
export function computeScreenHash(elements: UIElement[]): string {
  const signatures: string[] = [];
  for (const element of elements) {
    const { id, text, center, enabled, checked } = element;
    signatures.push(`${id}|${text}|${center[0]},${center[1]}|${enabled}|${checked}`);
  }
  return signatures.join(";");
}
|
||||||
|
|
||||||
|
/**
 * Parses Android Accessibility XML and returns a rich list of UI elements.
 *
 * A node is emitted when it has a `bounds` attribute that parses to a
 * rectangle with positive width and height AND it is either interactive
 * (clickable, editable, long-clickable, or scrollable) or carries text /
 * content-desc. State flags and parent/depth context are preserved.
 * Children are always visited, even when their parent is not emitted.
 *
 * @param xmlContent - Accessibility XML dump.
 * @returns Extracted elements in document order; [] when the XML cannot be
 *          parsed.
 */
export function getInteractiveElements(xmlContent: string): UIElement[] {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    allowBooleanAttributes: true,
  });

  let parsed: unknown;
  try {
    parsed = parser.parse(xmlContent);
  } catch {
    console.log("Warning: Error parsing XML. The screen might be loading.");
    return [];
  }

  const elements: UIElement[] = [];

  /**
   * Parses a "[x1,y1][x2,y2]" bounds attribute into its four numbers.
   * Returns undefined for anything that does not yield exactly four finite
   * values, so malformed bounds are skipped instead of producing NaN
   * coordinates.
   */
  function parseBounds(raw: unknown): [number, number, number, number] | undefined {
    if (typeof raw !== "string") return undefined;
    const coords = raw
      .replace("][", ",")
      .replace("[", "")
      .replace("]", "")
      .split(",")
      .map(Number);
    if (coords.length !== 4 || !coords.every((value) => Number.isFinite(value))) {
      return undefined;
    }
    return coords as [number, number, number, number];
  }

  function walk(node: any, parentLabel: string, depth: number): void {
    if (!node || typeof node !== "object") return;

    if (node["@_bounds"]) {
      const isClickable = node["@_clickable"] === "true";
      const isLongClickable = node["@_long-clickable"] === "true";
      const isScrollable = node["@_scrollable"] === "true";
      const isEnabled = node["@_enabled"] !== "false"; // default true
      const isChecked = node["@_checked"] === "true";
      const isFocused = node["@_focused"] === "true";
      const isSelected = node["@_selected"] === "true";
      const isPassword = node["@_password"] === "true";

      const elementClass = node["@_class"] ?? "";
      const isEditable =
        elementClass.includes("EditText") ||
        elementClass.includes("AutoCompleteTextView") ||
        node["@_editable"] === "true";

      const text: string = node["@_text"] ?? "";
      const desc: string = node["@_content-desc"] ?? "";
      const resourceId: string = node["@_resource-id"] ?? "";
      const hint: string = node["@_hint"] ?? "";

      // Build a label for this node to use as parent context for children
      const typeName = elementClass.split(".").pop() ?? "";
      const nodeLabel = text || desc || resourceId.split("/").pop() || typeName;

      // Determine if this element should be included
      const isInteractive = isClickable || isEditable || isLongClickable || isScrollable;
      const hasContent = !!(text || desc);

      if (isInteractive || hasContent) {
        const bounds: string = node["@_bounds"];
        const rect = parseBounds(bounds);

        // Malformed bounds: skip this element but still walk its children.
        if (rect) {
          const [x1, y1, x2, y2] = rect;
          const centerX = Math.floor((x1 + x2) / 2);
          const centerY = Math.floor((y1 + y2) / 2);
          const width = x2 - x1;
          const height = y2 - y1;

          // Skip zero-size elements (invisible); children are still walked.
          if (width > 0 && height > 0) {
            let suggestedAction: UIElement["action"];
            if (isEditable) suggestedAction = "type";
            else if (isLongClickable && !isClickable) suggestedAction = "longpress";
            else if (isScrollable && !isClickable) suggestedAction = "scroll";
            else if (isClickable) suggestedAction = "tap";
            else suggestedAction = "read";

            elements.push({
              id: resourceId,
              text: text || desc,
              type: typeName,
              bounds,
              center: [centerX, centerY],
              size: [width, height],
              clickable: isClickable,
              editable: isEditable,
              enabled: isEnabled,
              checked: isChecked,
              focused: isFocused,
              selected: isSelected,
              scrollable: isScrollable,
              longClickable: isLongClickable,
              password: isPassword,
              hint: hint,
              action: suggestedAction,
              parent: parentLabel,
              depth,
            });
          }
        }
      }

      // Recurse with updated parent label
      walkChildren(node, nodeLabel, depth + 1);
      return;
    }

    // No bounds on this node — just recurse
    walkChildren(node, parentLabel, depth);
  }

  /** Visits `node.node` (a single child or an array) and `node.hierarchy`. */
  function walkChildren(node: any, parentLabel: string, depth: number): void {
    if (node.node) {
      const children = Array.isArray(node.node) ? node.node : [node.node];
      for (const child of children) {
        walk(child, parentLabel, depth);
      }
    }
    if (node.hierarchy) {
      walk(node.hierarchy, parentLabel, depth);
    }
  }

  walk(parsed, "root", 0);
  return elements;
}
|
||||||
19
android-action-kernel/tsconfig.json
Normal file
19
android-action-kernel/tsconfig.json
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2022",
|
||||||
|
"module": "ES2022",
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
"strict": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"skipLibCheck": true,
|
||||||
|
"outDir": "dist",
|
||||||
|
"rootDir": "src",
|
||||||
|
"types": ["bun-types"],
|
||||||
|
"resolveJsonModule": true,
|
||||||
|
"declaration": true,
|
||||||
|
"declarationMap": true,
|
||||||
|
"sourceMap": true
|
||||||
|
},
|
||||||
|
"include": ["src/**/*.ts"],
|
||||||
|
"exclude": ["node_modules", "dist"]
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user