Flatten project structure: move android-action-kernel/ to root
Removes the unnecessary nesting — all source, config, and docs now live at the project root for simpler paths and commands. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
416
src/kernel.ts
Normal file
416
src/kernel.ts
Normal file
@@ -0,0 +1,416 @@
|
||||
/**
|
||||
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
|
||||
*
|
||||
* An AI agent that controls Android devices through the accessibility API.
|
||||
* Uses LLMs to make decisions based on screen context.
|
||||
*
|
||||
* Features:
|
||||
* - Perception -> Reasoning -> Action loop
|
||||
* - Screen state diffing (stuck loop detection)
|
||||
* - Error recovery with retries
|
||||
* - Vision fallback & always-on multimodal screenshots
|
||||
* - Dynamic early exit on goal completion
|
||||
* - Smart element filtering (compact JSON, top-N scoring)
|
||||
* - Multi-turn conversation memory
|
||||
* - Multi-step planning (think/plan/planProgress)
|
||||
* - Streaming LLM responses
|
||||
* - Session logging with crash-safe partial writes
|
||||
* - Auto-detect screen resolution & foreground app
|
||||
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
|
||||
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
|
||||
*
|
||||
* Usage:
|
||||
* bun run src/kernel.ts
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync } from "fs";
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
executeAction,
|
||||
runAdbCommand,
|
||||
getScreenResolution,
|
||||
getForegroundApp,
|
||||
initDeviceContext,
|
||||
type ActionDecision,
|
||||
type ActionResult,
|
||||
} from "./actions.js";
|
||||
import {
|
||||
getLlmProvider,
|
||||
trimMessages,
|
||||
SYSTEM_PROMPT,
|
||||
type LLMProvider,
|
||||
type ChatMessage,
|
||||
type ContentPart,
|
||||
} from "./llm-providers.js";
|
||||
import {
|
||||
getInteractiveElements,
|
||||
computeScreenHash,
|
||||
filterElements,
|
||||
type UIElement,
|
||||
} from "./sanitizer.js";
|
||||
import {
|
||||
DEVICE_SCREENSHOT_PATH,
|
||||
LOCAL_SCREENSHOT_PATH,
|
||||
} from "./constants.js";
|
||||
import { SessionLogger } from "./logger.js";
|
||||
|
||||
// ===========================================
|
||||
// Screen Perception
|
||||
// ===========================================
|
||||
|
||||
/** Result of one UI dump, as produced by getScreenState(). */
interface ScreenState {
  /** Elements parsed from the dumped UI XML; empty when the capture failed. */
  elements: UIElement[];
  /** JSON string of the filtered elements, or an error message when the capture failed. */
  compactJson: string;
}
|
||||
|
||||
/**
 * Dumps the current UI hierarchy on the device, pulls the dump to the local
 * path configured in Config, and parses it.
 *
 * @returns The parsed elements together with the JSON string of the filtered
 *   element list. When an ADB command throws, or the pulled file does not
 *   exist, the element list is empty and `compactJson` carries an error text.
 */
function getScreenState(): ScreenState {
  // Both failure paths hand back the same shape.
  const captureFailed = (): ScreenState => ({
    elements: [],
    compactJson: "Error: Could not capture screen.",
  });

  try {
    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
  } catch {
    console.log("Warning: ADB screen capture failed.");
    return captureFailed();
  }

  if (!existsSync(Config.LOCAL_DUMP_PATH)) {
    return captureFailed();
  }

  const parsed = getInteractiveElements(
    readFileSync(Config.LOCAL_DUMP_PATH, "utf-8")
  );
  return {
    elements: parsed,
    compactJson: JSON.stringify(filterElements(parsed, Config.MAX_ELEMENTS)),
  };
}
|
||||
|
||||
/**
 * Takes a screenshot on the device, pulls it to the local screenshot path and
 * encodes the file contents as base64.
 *
 * @returns The base64 string, or null when the pulled file is missing or any
 *   step throws (a warning is logged in the latter case).
 */
function captureScreenshotBase64(): string | null {
  try {
    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);

    if (!existsSync(LOCAL_SCREENSHOT_PATH)) {
      return null;
    }

    // readFileSync without an encoding already yields a Buffer.
    const png = readFileSync(LOCAL_SCREENSHOT_PATH);
    return png.toString("base64");
  } catch {
    console.log("Warning: Screenshot capture failed.");
    return null;
  }
}
|
||||
|
||||
// ===========================================
|
||||
// Screen State Diffing
|
||||
// ===========================================
|
||||
|
||||
/** Comparison of two consecutive screen captures, as produced by diffScreenState(). */
interface ScreenDiff {
  /** True when the screen hashes of the two captures differ. */
  changed: boolean;
  /** Element texts present in the current capture but not in the previous one. */
  addedTexts: string[];
  /** Element texts present in the previous capture but not in the current one. */
  removedTexts: string[];
  /** One-line description of the change, appended to the LLM prompt by the agent loop. */
  summary: string;
}
|
||||
|
||||
/**
 * Compares two element lists and describes what changed between them.
 *
 * "Changed" is decided by comparing computeScreenHash() of both lists; the
 * added/removed text lists are computed independently from the distinct,
 * non-empty `text` values of the elements.
 *
 * @param prevElements - Elements from the previous capture.
 * @param currElements - Elements from the current capture.
 * @returns The change flag, the text differences and a summary sentence
 *   (at most five texts are listed per direction).
 */
function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const textsOf = (list: UIElement[]) =>
    new Set(list.map((el) => el.text).filter(Boolean));

  const before = textsOf(prevElements);
  const after = textsOf(currElements);

  const addedTexts = [...after].filter((t) => !before.has(t));
  const removedTexts = [...before].filter((t) => !after.has(t));

  const changed =
    computeScreenHash(prevElements) !== computeScreenHash(currElements);

  if (!changed) {
    return {
      changed,
      addedTexts,
      removedTexts,
      summary: "Screen has NOT changed since last action.",
    };
  }

  const fragments = [
    addedTexts.length > 0
      ? `New on screen: ${addedTexts.slice(0, 5).join(", ")}`
      : null,
    removedTexts.length > 0
      ? `Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`
      : null,
  ].filter((part): part is string => part !== null);

  // The hash can differ even when no text was added or removed.
  const summary = fragments.join(". ") || "Screen layout changed.";

  return { changed, addedTexts, removedTexts, summary };
}
|
||||
|
||||
// ===========================================
|
||||
// Streaming LLM Consumer
|
||||
// ===========================================
|
||||
|
||||
/**
 * Asks the LLM for the next decision, streaming the response when possible.
 *
 * Falls back to the provider's non-streaming getDecision() when streaming is
 * disabled in Config, the provider does not advertise streaming support, or it
 * has no getDecisionStream method. While streaming, "Thinking" followed by one
 * dot per received chunk is written to stdout as progress feedback.
 *
 * @param llm - Provider to query.
 * @param messages - Conversation to send.
 * @returns The decision parsed from the complete streamed text.
 */
async function getDecisionStreaming(
  llm: LLMProvider,
  messages: ChatMessage[]
): Promise<ActionDecision> {
  if (
    !Config.STREAMING_ENABLED ||
    !llm.capabilities.supportsStreaming ||
    !llm.getDecisionStream
  ) {
    return llm.getDecision(messages);
  }

  const pieces: string[] = [];
  process.stdout.write("Thinking");
  for await (const piece of llm.getDecisionStream(messages)) {
    pieces.push(piece);
    process.stdout.write(".");
  }
  process.stdout.write("\n");

  return parseJsonResponse(pieces.join(""));
}
|
||||
|
||||
/**
 * Parses the text of a streamed LLM response into an action decision.
 *
 * Candidates are tried in this order:
 *   1. the whole text,
 *   2. the span from the first "{" to the last "}" (handles a JSON object with
 *      nested objects or "}" inside strings that is wrapped in markdown/prose),
 *   3. the shortest "{...}" match (handles a flat object followed by further
 *      text that contains braces).
 *
 * A candidate is accepted only when it parses to a plain object with a string
 * `action`; values such as `null`, numbers or arrays are rejected so callers
 * can safely read properties of the result.
 *
 * @param text - Raw response text.
 * @returns The parsed decision, or a "wait" decision when nothing usable is
 *   found (a warning with the first 200 characters is logged in that case).
 */
function parseJsonResponse(text: string): ActionDecision {
  const candidates: string[] = [text];

  const firstBrace = text.indexOf("{");
  const lastBrace = text.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(text.slice(firstBrace, lastBrace + 1));
  }

  const shortest = text.match(/\{[\s\S]*?\}/);
  if (shortest) {
    candidates.push(shortest[0]);
  }

  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      continue; // not valid JSON, try the next candidate
    }

    if (
      typeof parsed === "object" &&
      parsed !== null &&
      !Array.isArray(parsed) &&
      typeof (parsed as { action?: unknown }).action === "string"
    ) {
      return parsed as ActionDecision;
    }
  }

  console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`);
  return { action: "wait", reason: "Failed to parse response, waiting" };
}
|
||||
|
||||
// ===========================================
|
||||
// Main Agent Loop
|
||||
// ===========================================
|
||||
|
||||
/**
 * Runs the perception -> reasoning -> action loop for one goal.
 *
 * Each step captures the screen, compares it with the previous capture,
 * optionally takes a screenshot, asks the LLM for a decision, executes it and
 * records the step in the session log. The loop ends when the LLM answers with
 * the "done" action or when the step budget is exhausted.
 *
 * @param goal - Task description, repeated in every user message.
 * @param maxSteps - Step budget; Config.MAX_STEPS is used when omitted.
 */
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
  const stepLimit = maxSteps ?? Config.MAX_STEPS;

  // Phase 1A: Auto-detect screen resolution
  const resolution = getScreenResolution();
  if (!resolution) {
    console.log("Screen resolution: using default 1080x2400 swipe coords");
  } else {
    initDeviceContext(resolution);
    console.log(`Screen resolution: ${resolution[0]}x${resolution[1]}`);
  }

  console.log("Android Action Kernel Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${stepLimit} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision: ${Config.VISION_MODE} | Streaming: ${Config.STREAMING_ENABLED}`);
  console.log(`Max elements: ${Config.MAX_ELEMENTS} | History: ${Config.MAX_HISTORY_STEPS} steps`);

  const llm = getLlmProvider();

  // Phase 2B: Session logging
  const logger = new SessionLogger(
    Config.LOG_DIR,
    goal,
    Config.LLM_PROVIDER,
    Config.getModel()
  );

  // Phase 4A: Multi-turn conversation memory (full history; a trimmed view is sent)
  const messages: ChatMessage[] = [{ role: "system", content: SYSTEM_PROMPT }];

  let prevElements: UIElement[] = [];
  let stuckCount = 0;

  for (let stepNo = 1; stepNo <= stepLimit; stepNo++) {
    console.log(`\n--- Step ${stepNo}/${stepLimit} ---`);

    // 1. Perception: capture screen state
    console.log("Scanning screen...");
    const { elements, compactJson: screenContext } = getScreenState();

    // 1B. Foreground app detection
    const foregroundApp = getForegroundApp();
    if (foregroundApp) {
      console.log(`Foreground: ${foregroundApp}`);
    }

    // 2. Screen diff: detect stuck loops (there is nothing to compare on the first step)
    let diffContext = "";
    let screenChanged = true;
    if (stepNo > 1) {
      const diff = diffScreenState(prevElements, elements);
      screenChanged = diff.changed;
      diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;

      if (diff.changed) {
        stuckCount = 0;
      } else {
        stuckCount += 1;
        console.log(`Warning: Screen unchanged for ${stuckCount} step(s).`);

        if (stuckCount >= Config.STUCK_THRESHOLD) {
          console.log(`Stuck for ${stuckCount} steps. Injecting recovery hint.`);
          diffContext += [
            `\nWARNING: You have been stuck for ${stuckCount} steps. `,
            `The screen is NOT changing. Try a DIFFERENT action: `,
            `swipe to scroll, press back, go home, or launch a different app.`,
            `\nYour plan is not working. Create a NEW plan with a different approach.`,
          ].join("");
        }
      }
    }
    prevElements = elements;

    // 3. Vision: capture screenshot based on VISION_MODE
    const noElements = elements.length === 0;
    const wantsScreenshot =
      Config.VISION_MODE === "always" ||
      (Config.VISION_MODE === "fallback" && noElements);

    const screenshotBase64 = wantsScreenshot ? captureScreenshotBase64() : null;
    const visionContext =
      wantsScreenshot && noElements
        ? "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
          "A screenshot has been captured. The screen likely contains custom-drawn " +
          "content (game, WebView, or Flutter). Try using coordinate-based taps on " +
          "common UI positions, or use 'back'/'home' to navigate away."
        : "";

    // The image is only attached when the provider can consume it.
    const imageForLlm =
      screenshotBase64 && llm.capabilities.supportsImages
        ? screenshotBase64
        : null;
    if (imageForLlm) {
      console.log("Sending screenshot to LLM");
    }

    // 4. Build user message with all context
    const foregroundLine = foregroundApp
      ? `FOREGROUND_APP: ${foregroundApp}\n\n`
      : "";
    const textContent =
      `GOAL: ${goal}\n\n${foregroundLine}SCREEN_CONTEXT:\n${screenContext}${diffContext}${visionContext}`;

    // Content parts: text first, then the optional image
    const userContent: ContentPart[] = [{ type: "text", text: textContent }];
    if (imageForLlm) {
      userContent.push({
        type: "image",
        base64: imageForLlm,
        mimeType: "image/png",
      });
    }
    messages.push({ role: "user", content: userContent });

    // Trim messages to keep within history limit
    const trimmed = trimMessages(messages, Config.MAX_HISTORY_STEPS);

    // 5. Reasoning: get LLM decision; a failed request degrades to "wait"
    const llmStart = performance.now();
    let decision: ActionDecision;
    try {
      decision = await getDecisionStreaming(llm, trimmed);
    } catch (err) {
      console.log(`LLM Error: ${(err as Error).message}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting" };
    }
    const llmLatency = performance.now() - llmStart;
    const llmMs = Math.round(llmLatency);

    // Log thinking and planning
    if (decision.think) {
      console.log(`Think: ${decision.think}`);
    }
    if (decision.plan) {
      console.log(`Plan: ${decision.plan.join(" -> ")}`);
    }
    if (decision.planProgress) {
      console.log(`Progress: ${decision.planProgress}`);
    }
    console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"} (${llmMs}ms)`);

    // Append assistant response to conversation
    messages.push({ role: "assistant", content: JSON.stringify(decision) });

    // 6. Action: execute the decision; a thrown error becomes a failed result
    const actionStart = performance.now();
    let result: ActionResult;
    try {
      result = executeAction(decision);
    } catch (err) {
      console.log(`Action Error: ${(err as Error).message}`);
      result = { success: false, message: (err as Error).message };
    }
    const actionLatency = performance.now() - actionStart;

    // Log step
    logger.logStep(
      stepNo,
      foregroundApp,
      elements.length,
      screenChanged,
      decision,
      result,
      llmMs,
      Math.round(actionLatency)
    );

    console.log(`Messages in context: ${trimmed.length}`);

    // 7. Check for goal completion
    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      logger.finalize(true);
      return;
    }

    // Wait for UI to update
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }

  console.log("\nMax steps reached. Task may be incomplete.");
  logger.finalize(false);
}
|
||||
|
||||
// ===========================================
|
||||
// Entry Point
|
||||
// ===========================================
|
||||
|
||||
/**
 * Entry point: validates the configuration, reads the goal from stdin and
 * starts the agent loop.
 *
 * Returns early (after logging) when the configuration is invalid or when no
 * goal was entered. Only the first chunk delivered by stdin is read; a closed
 * stdin yields an empty goal.
 */
async function main(): Promise<void> {
  try {
    Config.validate();
  } catch (e) {
    // Not every thrown value is an Error; avoid printing "undefined".
    const reason = e instanceof Error ? e.message : String(e);
    console.log(`Configuration Error: ${reason}`);
    return;
  }

  // Read user input from stdin
  process.stdout.write("Enter your goal: ");
  const reader = Bun.stdin.stream().getReader();
  let goal: string;
  try {
    // Awaiting read() directly lets a failed read reject main() instead of
    // leaving a wrapper promise pending forever.
    const { value } = await reader.read();
    goal = new TextDecoder().decode(value).trim();
  } finally {
    // Release the stream lock on success and on failure alike.
    reader.releaseLock();
  }

  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }

  await runAgent(goal);
}
|
||||
|
||||
// Start the program; the returned promise is intentionally not awaited.
void main();
|
||||
Reference in New Issue
Block a user