Flatten project structure: move android-action-kernel/ to root

Removes the unnecessary nesting — all source, config, and docs now live
at the project root for simpler paths and commands.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sanju Sivalingam
2026-02-06 16:02:40 +05:30
parent 610fd04818
commit 879509aebc
16 changed files with 862 additions and 7 deletions

390
src/actions.ts Normal file
View File

@@ -0,0 +1,390 @@
/**
* Action execution module for Android Action Kernel.
* Handles all ADB commands for interacting with Android devices.
*
* Supported actions:
* tap, type, enter, swipe, home, back, wait, done,
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
*/
import { Config } from "./config.js";
import {
KEYCODE_ENTER,
KEYCODE_HOME,
KEYCODE_BACK,
KEYCODE_DEL,
KEYCODE_MOVE_HOME,
KEYCODE_MOVE_END,
SWIPE_COORDS,
SWIPE_DURATION_MS,
LONG_PRESS_DURATION_MS,
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
computeSwipeCoords,
} from "./constants.js";
/**
 * The JSON decision emitted by the LLM each step. `action` names one of the
 * 15 supported actions; every other field is an optional payload read only
 * by the matching handler in executeAction().
 */
export interface ActionDecision {
  action: string;
  // tap / longpress target as [x, y] screen pixels
  coordinates?: [number, number];
  // type / clipboard_set payload
  text?: string;
  // swipe direction: "up" | "down" | "left" | "right"
  direction?: string;
  // human-readable justification; echoed to console and session logs
  reason?: string;
  // launch action: explicit component, VIEW-intent URI, and string extras
  package?: string;
  activity?: string;
  uri?: string;
  extras?: Record<string, string>;
  // shell action: raw command executed via `adb shell`
  command?: string;
  // screenshot action: local destination path
  filename?: string;
  // planning fields (Phase 4B) — logged, not executed
  think?: string;
  plan?: string[];
  planProgress?: string;
}
/**
 * Outcome of one executed action. `data` carries optional payload text
 * (e.g. shell output, clipboard contents, or a screenshot path).
 */
export interface ActionResult {
  success: boolean;
  message: string;
  data?: string;
}
/**
 * Runs an ADB command synchronously, retrying with exponential backoff when
 * stderr reports an error. Returns trimmed stdout; when every retry fails
 * the error is logged and the (possibly empty) stdout is still returned.
 */
export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): string {
  let attempt = 0;
  while (attempt <= retries) {
    const proc = Bun.spawnSync([Config.ADB_PATH, ...command], {
      stdout: "pipe",
      stderr: "pipe",
    });
    const out = proc.stdout.toString().trim();
    const err = proc.stderr.toString().trim();
    const looksLikeError = err !== "" && err.toLowerCase().includes("error");
    if (looksLikeError && attempt < retries) {
      // Exponential backoff: 1s, 2s, 4s, ...
      const backoffMs = 2 ** attempt * 1000;
      console.log(`ADB Error (attempt ${attempt + 1}/${retries + 1}): ${err}`);
      console.log(`Retrying in ${backoffMs / 1000}s...`);
      Bun.sleepSync(backoffMs);
      attempt++;
      continue;
    }
    if (looksLikeError) {
      console.log(`ADB Error (all retries exhausted): ${err}`);
    }
    return out;
  }
  return "";
}
// ===========================================
// Device Intelligence (Phase 1)
// ===========================================
/** Module-level dynamic swipe coords, set by initDeviceContext(); null until then. */
let dynamicSwipeCoords: Record<string, [number, number, number, number]> | null = null;
/**
 * Detects the connected device's screen resolution via `adb shell wm size`.
 * Returns [width, height], or null when no size line can be parsed.
 */
export function getScreenResolution(): [number, number] | null {
  try {
    const output = runAdbCommand(["shell", "wm", "size"]);
    // An override (if set) takes precedence over the physical size.
    const patterns = [
      /Override size:\s*(\d+)x(\d+)/,
      /Physical size:\s*(\d+)x(\d+)/,
    ];
    for (const pattern of patterns) {
      const m = output.match(pattern);
      if (m) {
        return [parseInt(m[1], 10), parseInt(m[2], 10)];
      }
    }
  } catch {
    console.log("Warning: Could not detect screen resolution.");
  }
  return null;
}
/**
 * Detects the currently resumed (foreground) activity via dumpsys.
 * Returns "package/activity", or null when the dump cannot be parsed.
 */
export function getForegroundApp(): string | null {
  try {
    const dump = runAdbCommand([
      "shell", "dumpsys", "activity", "activities",
    ]);
    // The mResumedActivity line contains the pkg/activity token.
    const resumed = dump.match(/mResumedActivity.*?(\S+\/\S+)/);
    if (!resumed) {
      return null;
    }
    return resumed[1].replace("}", "");
  } catch {
    // Best-effort: callers treat null as "unknown".
    return null;
  }
}
/**
 * Computes and caches swipe coordinates for the detected resolution.
 * Call once at startup; until then swipes use the hardcoded defaults.
 */
export function initDeviceContext(resolution: [number, number]): void {
  const [width, height] = resolution;
  dynamicSwipeCoords = computeSwipeCoords(width, height);
}
/** Returns the resolution-scaled swipe table when initialized, else the 1080x2400 defaults. */
function getSwipeCoords(): Record<string, [number, number, number, number]> {
  if (dynamicSwipeCoords !== null) {
    return dynamicSwipeCoords;
  }
  return SWIPE_COORDS;
}
/**
 * Dispatches the LLM's decision to the matching handler and returns its
 * result. Unknown action names are logged and reported as failures.
 */
export function executeAction(action: ActionDecision): ActionResult {
  const handlers: Record<string, () => ActionResult> = {
    tap: () => executeTap(action),
    type: () => executeType(action),
    enter: () => executeEnter(),
    swipe: () => executeSwipe(action),
    home: () => executeHome(),
    back: () => executeBack(),
    wait: () => executeWait(),
    done: () => executeDone(action),
    longpress: () => executeLongPress(action),
    screenshot: () => executeScreenshot(action),
    launch: () => executeLaunch(action),
    clear: () => executeClear(),
    clipboard_get: () => executeClipboardGet(),
    clipboard_set: () => executeClipboardSet(action),
    shell: () => executeShell(action),
  };
  // hasOwnProperty guard so inherited names ("toString", …) don't dispatch.
  if (!Object.prototype.hasOwnProperty.call(handlers, action.action)) {
    console.log(`Warning: Unknown action: ${action.action}`);
    return { success: false, message: `Unknown action: ${action.action}` };
  }
  return handlers[action.action]();
}
// ===========================================
// Original actions (enhanced)
// ===========================================
/** Taps the screen at the decision's coordinates (defaults to (0, 0)). */
function executeTap(action: ActionDecision): ActionResult {
  const coords = action.coordinates ?? [0, 0];
  const x = coords[0];
  const y = coords[1];
  console.log(`Tapping: (${x}, ${y})`);
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);
  return { success: true, message: `Tapped (${x}, ${y})` };
}
/** Types text into the focused field via `adb shell input text`. */
function executeType(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to type" };
  // `input text` needs %s for spaces and backslash-escapes for shell
  // metacharacters. Order matters: backslashes are doubled first so that
  // later-introduced escapes are not themselves re-escaped.
  const replacements: Array<[string, string]> = [
    ["\\", "\\\\"],
    ["\"", "\\\""],
    ["'", "\\'"],
    [" ", "%s"],
    ["&", "\\&"],
    ["|", "\\|"],
    [";", "\\;"],
    ["(", "\\("],
    [")", "\\)"],
    ["<", "\\<"],
    [">", "\\>"],
  ];
  let escaped = text;
  for (const [from, to] of replacements) {
    escaped = escaped.replaceAll(from, to);
  }
  console.log(`Typing: ${text}`);
  runAdbCommand(["shell", "input", "text", escaped]);
  return { success: true, message: `Typed "${text}"` };
}
/** Sends the Enter key (keycode 66) to submit the focused input. */
function executeEnter(): ActionResult {
  console.log("Pressing Enter");
  const keyArgs = ["shell", "input", "keyevent", KEYCODE_ENTER];
  runAdbCommand(keyArgs);
  return { success: true, message: "Pressed Enter" };
}
function executeSwipe(action: ActionDecision): ActionResult {
const direction = action.direction ?? "up";
const swipeCoords = getSwipeCoords();
const coords = swipeCoords[direction] ?? swipeCoords["up"];
console.log(`Swiping ${direction}`);
runAdbCommand([
"shell", "input", "swipe",
String(coords[0]), String(coords[1]),
String(coords[2]), String(coords[3]),
SWIPE_DURATION_MS,
]);
return { success: true, message: `Swiped ${direction}` };
}
/** Presses the Home key to return to the launcher. */
function executeHome(): ActionResult {
  console.log("Going Home");
  const keyArgs = ["shell", "input", "keyevent", KEYCODE_HOME];
  runAdbCommand(keyArgs);
  return { success: true, message: "Went to home screen" };
}
/** Presses the Back key to navigate one screen back. */
function executeBack(): ActionResult {
  console.log("Going Back");
  const keyArgs = ["shell", "input", "keyevent", KEYCODE_BACK];
  runAdbCommand(keyArgs);
  return { success: true, message: "Went back" };
}
/** Blocks for two seconds to let the UI settle. */
function executeWait(): ActionResult {
  console.log("Waiting...");
  const pauseMs = 2000;
  Bun.sleepSync(pauseMs);
  return { success: true, message: "Waited 2s" };
}
/** Terminal action: logs the completion reason; the kernel exits on "done". */
function executeDone(action: ActionDecision): ActionResult {
  const reason = action.reason ?? "Task complete";
  console.log(`Goal Achieved: ${reason}`);
  return { success: true, message: "done" };
}
// ===========================================
// New actions
// ===========================================
/**
 * Long-presses at the given coordinates (opens context menus, starts drag
 * mode, etc.) by issuing a zero-distance swipe held for the long-press
 * duration.
 */
function executeLongPress(action: ActionDecision): ActionResult {
  const [x, y] = action.coordinates ?? [0, 0];
  console.log(`Long pressing: (${x}, ${y})`);
  const point = [String(x), String(y)];
  runAdbCommand([
    "shell", "input", "swipe",
    ...point, ...point,
    LONG_PRESS_DURATION_MS,
  ]);
  return { success: true, message: `Long pressed (${x}, ${y})` };
}
/**
 * Captures a screenshot on the device and pulls it to the local filesystem.
 * Saves to `action.filename` when provided, otherwise LOCAL_SCREENSHOT_PATH;
 * the local path is returned in `data`.
 */
function executeScreenshot(action: ActionDecision): ActionResult {
  const filename = action.filename ?? LOCAL_SCREENSHOT_PATH;
  // Bug fix: both messages previously printed the literal "$(unknown)"
  // instead of interpolating the destination filename.
  console.log(`Taking screenshot → ${filename}`);
  runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
  runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, filename]);
  return { success: true, message: `Screenshot saved to ${filename}`, data: filename };
}
/**
 * Launches an app by package name, activity, or URI intent.
 *
 * Examples the LLM can produce:
 *   { action: "launch", package: "com.whatsapp" }
 *   { action: "launch", package: "com.whatsapp", activity: ".HomeActivity" }
 *   { action: "launch", uri: "https://maps.google.com/?q=pizza+near+me" }
 *   { action: "launch", package: "com.whatsapp", uri: "content://media/external/images/1",
 *     extras: { "android.intent.extra.TEXT": "Check this out" } }
 */
function executeLaunch(action: ActionDecision): ActionResult {
  // Fast path: bare package with no activity AND no URI — let monkey resolve
  // the default LAUNCHER activity. (Bug fix: this shortcut previously also
  // fired when a URI accompanied the package, silently dropping the URI and
  // extras before the intent was built.)
  if (action.package && !action.activity && !action.uri) {
    const launchResult = runAdbCommand([
      "shell", "monkey", "-p", action.package, "-c",
      "android.intent.category.LAUNCHER", "1",
    ]);
    console.log(`Launching: ${action.package}`);
    return { success: true, message: `Launched ${action.package}`, data: launchResult };
  }
  const args: string[] = ["shell", "am", "start"];
  if (action.uri) {
    args.push("-a", "android.intent.action.VIEW");
    args.push("-d", action.uri);
  }
  if (action.package && action.activity) {
    args.push("-n", `${action.package}/${action.activity}`);
  }
  // Attach string extras to the intent.
  if (action.extras) {
    for (const [key, value] of Object.entries(action.extras)) {
      args.push("--es", key, value);
    }
  }
  // Scope an implicit (URI) intent to a package when no explicit activity was
  // given: the `am start` intent spec accepts a trailing PACKAGE argument.
  if (action.package && !action.activity) {
    args.push(action.package);
  }
  const label = action.package ?? action.uri ?? "intent";
  console.log(`Launching: ${label}`);
  const result = runAdbCommand(args);
  return { success: true, message: `Launched ${label}`, data: result };
}
/**
 * Clears the currently focused text field by moving the cursor to the end,
 * selecting back to the start, then deleting the selection.
 * NOTE(review): relies on long-pressing MOVE_HOME extending the selection on
 * the target device/keyboard — confirm on-device.
 */
function executeClear(): ActionResult {
  console.log("Clearing text field");
  const keySequences: string[][] = [
    [KEYCODE_MOVE_END],                   // jump to end of field
    ["--longpress", KEYCODE_MOVE_HOME],   // select back to the start
    [KEYCODE_DEL],                        // delete the selection
  ];
  for (const seq of keySequences) {
    runAdbCommand(["shell", "input", "keyevent", ...seq]);
  }
  return { success: true, message: "Cleared text field" };
}
/**
 * Reads the current clipboard contents. Tries `cmd clipboard get-text`
 * first (direct ADB clipboard access is restricted on Android 10+); when
 * that yields nothing, falls back to a raw binder call for older versions.
 */
function executeClipboardGet(): ActionResult {
  console.log("Reading clipboard");
  const text = runAdbCommand(["shell", "cmd", "clipboard", "get-text"]);
  if (text) {
    console.log(`Clipboard: ${text.slice(0, 100)}`);
    return { success: true, message: `Clipboard: ${text}`, data: text };
  }
  // Older Android versions: raw service call against the clipboard binder.
  const raw = runAdbCommand([
    "shell", "service", "call", "clipboard", "2", "i32", "1",
  ]);
  return { success: true, message: `Clipboard (raw): ${raw}`, data: raw };
}
/**
 * Sets the device clipboard to the given text via `cmd clipboard set-text`.
 */
function executeClipboardSet(action: ActionDecision): ActionResult {
  const text = action.text ?? "";
  if (!text) return { success: false, message: "No text to set on clipboard" };
  const preview = text.slice(0, 50);
  console.log(`Setting clipboard: ${preview}...`);
  runAdbCommand(["shell", "cmd", "clipboard", "set-text", text]);
  return { success: true, message: `Clipboard set to "${preview}"` };
}
/**
 * Runs an arbitrary ADB shell command. Use sparingly for edge cases.
 * NOTE(review): naive whitespace split — quoted arguments are not preserved.
 */
function executeShell(action: ActionDecision): ActionResult {
  const cmd = action.command ?? "";
  if (!cmd) return { success: false, message: "No command provided" };
  console.log(`Shell: ${cmd}`);
  const parts = cmd.split(" ");
  const output = runAdbCommand(["shell", ...parts]);
  return { success: true, message: `Shell output: ${output.slice(0, 200)}`, data: output };
}

99
src/config.ts Normal file
View File

@@ -0,0 +1,99 @@
/**
* Configuration management for Android Action Kernel.
* Bun natively loads .env files — no dotenv needed.
*/
import {
DEVICE_DUMP_PATH,
LOCAL_DUMP_PATH,
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
DEFAULT_MAX_STEPS,
DEFAULT_STEP_DELAY,
DEFAULT_GROQ_MODEL,
DEFAULT_OPENAI_MODEL,
DEFAULT_BEDROCK_MODEL,
DEFAULT_MAX_RETRIES,
DEFAULT_STUCK_THRESHOLD,
DEFAULT_MAX_ELEMENTS,
DEFAULT_LOG_DIR,
DEFAULT_VISION_MODE,
DEFAULT_MAX_HISTORY_STEPS,
DEFAULT_STREAMING_ENABLED,
type VisionMode,
} from "./constants.js";
/** Reads an environment variable, returning `fallback` (default "") when unset. */
function env(key: string, fallback = ""): string {
  const value = process.env[key];
  return value !== undefined ? value : fallback;
}
/**
 * Central runtime configuration, resolved once at module load from
 * environment variables (Bun auto-loads .env) with defaults from constants.
 */
export const Config = {
  // ADB Configuration
  ADB_PATH: env("ADB_PATH", "adb"),
  SCREEN_DUMP_PATH: DEVICE_DUMP_PATH,
  LOCAL_DUMP_PATH: LOCAL_DUMP_PATH,
  DEVICE_SCREENSHOT_PATH: DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH: LOCAL_SCREENSHOT_PATH,
  // Agent Configuration (STEP_DELAY is in seconds)
  MAX_STEPS: parseInt(env("MAX_STEPS", String(DEFAULT_MAX_STEPS)), 10),
  STEP_DELAY: parseFloat(env("STEP_DELAY", String(DEFAULT_STEP_DELAY))),
  MAX_RETRIES: parseInt(env("MAX_RETRIES", String(DEFAULT_MAX_RETRIES)), 10),
  STUCK_THRESHOLD: parseInt(env("STUCK_THRESHOLD", String(DEFAULT_STUCK_THRESHOLD)), 10),
  // Vision mode: "off" | "fallback" (only when tree empty) | "always" (every step)
  // NOTE(review): the cast is unvalidated — an unrecognized VISION_MODE string
  // passes through silently; confirm whether validation is wanted.
  VISION_MODE: (env("VISION_MODE", DEFAULT_VISION_MODE) as VisionMode),
  // Smart element filtering: cap on UI elements sent to the LLM
  MAX_ELEMENTS: parseInt(env("MAX_ELEMENTS", String(DEFAULT_MAX_ELEMENTS)), 10),
  // Session logging
  LOG_DIR: env("LOG_DIR", DEFAULT_LOG_DIR),
  // Multi-turn memory: conversation turns retained per request
  MAX_HISTORY_STEPS: parseInt(env("MAX_HISTORY_STEPS", String(DEFAULT_MAX_HISTORY_STEPS)), 10),
  // Streaming responses (env value must be exactly "true" to enable)
  STREAMING_ENABLED: env("STREAMING_ENABLED", String(DEFAULT_STREAMING_ENABLED)) === "true",
  // LLM Provider: "groq", "openai", "bedrock", or "openrouter"
  LLM_PROVIDER: env("LLM_PROVIDER", "groq"),
  // Groq Configuration
  GROQ_API_KEY: env("GROQ_API_KEY"),
  GROQ_MODEL: env("GROQ_MODEL", DEFAULT_GROQ_MODEL),
  // OpenAI Configuration
  OPENAI_API_KEY: env("OPENAI_API_KEY"),
  OPENAI_MODEL: env("OPENAI_MODEL", DEFAULT_OPENAI_MODEL),
  // AWS Bedrock Configuration
  AWS_REGION: env("AWS_REGION", "us-east-1"),
  BEDROCK_MODEL: env("BEDROCK_MODEL", DEFAULT_BEDROCK_MODEL),
  // OpenRouter Configuration (via Vercel AI SDK)
  OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
  OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),
  /** Resolves the model id for the active provider; unknown providers fall back to the OpenAI model. */
  getModel(): string {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq") return Config.GROQ_MODEL;
    if (provider === "bedrock") return Config.BEDROCK_MODEL;
    if (provider === "openrouter") return Config.OPENROUTER_MODEL;
    return Config.OPENAI_MODEL;
  },
  /** Throws when the active provider's API key is missing. Call once at startup. */
  validate(): void {
    const provider = Config.LLM_PROVIDER;
    if (provider === "groq" && !Config.GROQ_API_KEY) {
      throw new Error("GROQ_API_KEY is required when using Groq provider");
    }
    if (provider === "openai" && !Config.OPENAI_API_KEY) {
      throw new Error("OPENAI_API_KEY is required when using OpenAI provider");
    }
    if (provider === "openrouter" && !Config.OPENROUTER_API_KEY) {
      throw new Error("OPENROUTER_API_KEY is required when using OpenRouter provider");
    }
    // Bedrock uses AWS credential chain, no explicit validation needed
  },
};

118
src/constants.ts Normal file
View File

@@ -0,0 +1,118 @@
/**
 * Constants for Android Action Kernel.
 * All magic strings, URLs, and fixed values in one place.
 */
// ===========================================
// API Endpoints
// ===========================================
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";
// ===========================================
// ADB Key Codes
// Values are passed verbatim to `adb shell input keyevent`, which accepts
// both numeric codes and symbolic KEYCODE_* names — hence the mix below.
// ===========================================
export const KEYCODE_ENTER = "66";
export const KEYCODE_HOME = "KEYCODE_HOME";
export const KEYCODE_BACK = "KEYCODE_BACK";
export const KEYCODE_DEL = "67";
export const KEYCODE_FORWARD_DEL = "112";
export const KEYCODE_MOVE_HOME = "122"; // cursor to start of line/field
export const KEYCODE_MOVE_END = "123"; // cursor to end of line/field
export const KEYCODE_MENU = "82";
export const KEYCODE_TAB = "61";
export const KEYCODE_ESCAPE = "111";
export const KEYCODE_DPAD_UP = "19";
export const KEYCODE_DPAD_DOWN = "20";
export const KEYCODE_DPAD_LEFT = "21";
export const KEYCODE_DPAD_RIGHT = "22";
export const KEYCODE_VOLUME_UP = "24";
export const KEYCODE_VOLUME_DOWN = "25";
export const KEYCODE_POWER = "26";
// ===========================================
// Default Screen Coordinates (for swipe actions)
// Fallbacks for a 1080x2400 screen; initDeviceContext() replaces them with
// values scaled to the detected resolution.
// ===========================================
export const SCREEN_CENTER_X = 540;
export const SCREEN_CENTER_Y = 1200;
// Swipe coordinates: [start_x, start_y, end_x, end_y]
// These are the fallback values for 1080x2400 screens
export const SWIPE_COORDS: Record<string, [number, number, number, number]> = {
  up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500],
  down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500],
  left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y],
  right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y],
};
/**
 * Builds a direction → [start_x, start_y, end_x, end_y] swipe table scaled
 * to the given screen size, using the same ratios as the hardcoded
 * 1080x2400 reference values.
 */
export function computeSwipeCoords(
  width: number,
  height: number
): Record<string, [number, number, number, number]> {
  const centerX = Math.floor(width / 2);
  const centerY = Math.floor(height / 2);
  // Vertical endpoints ≈ 20.8% / 62.5% of height (mirrors 500/1500 on 2400h).
  const topY = Math.floor(height * 0.208);
  const bottomY = Math.floor(height * 0.625);
  // Horizontal endpoints ≈ 18.5% / 74.1% of width (mirrors 200/800 on 1080w).
  const leftX = Math.floor(width * 0.185);
  const rightX = Math.floor(width * 0.741);
  return {
    up: [centerX, bottomY, centerX, topY],
    down: [centerX, topY, centerX, bottomY],
    left: [rightX, centerY, leftX, centerY],
    right: [leftX, centerY, rightX, centerY],
  };
}
// Durations are kept as strings because they are passed directly as ADB argv.
export const SWIPE_DURATION_MS = "300";
export const LONG_PRESS_DURATION_MS = "1000";
// ===========================================
// Default Models
// ===========================================
export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile";
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";
// ===========================================
// Bedrock Model Identifiers
// Substrings used to classify a Bedrock model id by vendor.
// ===========================================
export const BEDROCK_ANTHROPIC_MODELS = ["anthropic"];
export const BEDROCK_META_MODELS = ["meta", "llama"];
// ===========================================
// File Paths (DEVICE_* live on the Android device, LOCAL_* on the host)
// ===========================================
export const DEVICE_DUMP_PATH = "/sdcard/window_dump.xml";
export const LOCAL_DUMP_PATH = "window_dump.xml";
export const DEVICE_SCREENSHOT_PATH = "/sdcard/kernel_screenshot.png";
export const LOCAL_SCREENSHOT_PATH = "kernel_screenshot.png";
// ===========================================
// Agent Defaults
// ===========================================
export const DEFAULT_MAX_STEPS = 30;
export const DEFAULT_STEP_DELAY = 2.0; // seconds between steps
export const DEFAULT_MAX_RETRIES = 3;
export const DEFAULT_STUCK_THRESHOLD = 3;
// NOTE(review): appears superseded by DEFAULT_VISION_MODE below — confirm
// nothing outside this view still reads it before removing.
export const DEFAULT_VISION_ENABLED = true;
// Phase 2: Context Quality
export const DEFAULT_MAX_ELEMENTS = 40;
export const DEFAULT_LOG_DIR = "logs";
// Phase 3: Vision Mode
export type VisionMode = "off" | "fallback" | "always";
export const DEFAULT_VISION_MODE: VisionMode = "fallback";
// Phase 4: Multi-turn Memory
export const DEFAULT_MAX_HISTORY_STEPS = 10;
// Phase 5: Streaming
export const DEFAULT_STREAMING_ENABLED = true;

416
src/kernel.ts Normal file
View File

@@ -0,0 +1,416 @@
/**
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
*
* An AI agent that controls Android devices through the accessibility API.
* Uses LLMs to make decisions based on screen context.
*
* Features:
* - Perception -> Reasoning -> Action loop
* - Screen state diffing (stuck loop detection)
* - Error recovery with retries
* - Vision fallback & always-on multimodal screenshots
* - Dynamic early exit on goal completion
* - Smart element filtering (compact JSON, top-N scoring)
* - Multi-turn conversation memory
* - Multi-step planning (think/plan/planProgress)
* - Streaming LLM responses
* - Session logging with crash-safe partial writes
* - Auto-detect screen resolution & foreground app
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
*
* Usage:
* bun run src/kernel.ts
*/
import { existsSync, readFileSync } from "fs";
import { Config } from "./config.js";
import {
executeAction,
runAdbCommand,
getScreenResolution,
getForegroundApp,
initDeviceContext,
type ActionDecision,
type ActionResult,
} from "./actions.js";
import {
getLlmProvider,
trimMessages,
SYSTEM_PROMPT,
type LLMProvider,
type ChatMessage,
type ContentPart,
} from "./llm-providers.js";
import {
getInteractiveElements,
computeScreenHash,
filterElements,
type UIElement,
} from "./sanitizer.js";
import {
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
} from "./constants.js";
import { SessionLogger } from "./logger.js";
// ===========================================
// Screen Perception
// ===========================================
/** One UI snapshot: parsed elements plus the compact filtered JSON sent to the LLM. */
interface ScreenState {
  elements: UIElement[];
  compactJson: string;
}
/**
* Dumps the current UI XML and returns parsed elements + compact filtered JSON for the LLM.
*/
function getScreenState(): ScreenState {
try {
runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
} catch {
console.log("Warning: ADB screen capture failed.");
return { elements: [], compactJson: "Error: Could not capture screen." };
}
if (!existsSync(Config.LOCAL_DUMP_PATH)) {
return { elements: [], compactJson: "Error: Could not capture screen." };
}
const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
const elements = getInteractiveElements(xmlContent);
const compact = filterElements(elements, Config.MAX_ELEMENTS);
return { elements, compactJson: JSON.stringify(compact) };
}
/**
 * Captures a device screenshot, pulls it to the host, and returns it as a
 * base64-encoded PNG string — or null when capture or pull fails.
 */
function captureScreenshotBase64(): string | null {
  try {
    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);
    if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      // readFileSync already returns a Buffer; the previous extra
      // Buffer.from(...) wrap made a needless copy of the image bytes.
      return readFileSync(LOCAL_SCREENSHOT_PATH).toString("base64");
    }
  } catch {
    console.log("Warning: Screenshot capture failed.");
  }
  return null;
}
// ===========================================
// Screen State Diffing
// ===========================================
/** Difference between two consecutive screen snapshots, summarized for the LLM. */
interface ScreenDiff {
  // true when the element hash changed since the previous step
  changed: boolean;
  addedTexts: string[];
  removedTexts: string[];
  // human-readable one-liner injected into the prompt as SCREEN_CHANGE
  summary: string;
}
/**
 * Compares two element snapshots: reports whether the screen hash changed
 * and which element texts appeared/disappeared, with a short summary.
 */
function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const textsOf = (els: UIElement[]) =>
    new Set(els.map((e) => e.text).filter(Boolean));
  const before = textsOf(prevElements);
  const after = textsOf(currElements);
  const addedTexts = [...after].filter((t) => !before.has(t));
  const removedTexts = [...before].filter((t) => !after.has(t));
  const changed =
    computeScreenHash(prevElements) !== computeScreenHash(currElements);
  let summary: string;
  if (!changed) {
    summary = "Screen has NOT changed since last action.";
  } else {
    const fragments: string[] = [];
    if (addedTexts.length > 0) {
      fragments.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
    }
    if (removedTexts.length > 0) {
      fragments.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
    }
    summary = fragments.join(". ") || "Screen layout changed.";
  }
  return { changed, addedTexts, removedTexts, summary };
}
// ===========================================
// Streaming LLM Consumer
// ===========================================
/**
 * Gets a decision from the provider, consuming the streaming API when it is
 * enabled and supported (printing a dot per chunk), otherwise falling back
 * to the blocking getDecision call.
 */
async function getDecisionStreaming(
  llm: LLMProvider,
  messages: ChatMessage[]
): Promise<ActionDecision> {
  const stream = llm.getDecisionStream;
  const canStream =
    Config.STREAMING_ENABLED && llm.capabilities.supportsStreaming && !!stream;
  if (!canStream || !stream) {
    return llm.getDecision(messages);
  }
  let buffer = "";
  process.stdout.write("Thinking");
  for await (const piece of stream.call(llm, messages)) {
    buffer += piece;
    process.stdout.write(".");
  }
  process.stdout.write("\n");
  return parseJsonResponse(buffer);
}
/**
 * Parses an LLM response into an ActionDecision. Accepts raw JSON or JSON
 * embedded in surrounding text/markdown fences; falls back to a "wait"
 * action when nothing parseable is found.
 * (Duplicated from llm-providers for the streaming path.)
 */
function parseJsonResponse(text: string): ActionDecision {
  try {
    return JSON.parse(text);
  } catch {
    // Greedy match from the first "{" to the last "}" so decisions with
    // nested objects (e.g. "extras") are captured whole; the previous lazy
    // match stopped at the first "}" and truncated them.
    const match = text.match(/\{[\s\S]*\}/);
    if (match) {
      try {
        return JSON.parse(match[0]);
      } catch {
        // fall through to the wait fallback
      }
    }
    console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`);
    return { action: "wait", reason: "Failed to parse response, waiting" };
  }
}
// ===========================================
// Main Agent Loop
// ===========================================
/**
 * Main perception → reasoning → action loop.
 * Each step: capture the UI tree, diff it against the previous step (stuck
 * detection), optionally attach a screenshot, ask the LLM for a decision,
 * execute it, and log the step. Exits early on "done" or after `steps`.
 */
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
  const steps = maxSteps ?? Config.MAX_STEPS;
  // Phase 1A: auto-detect resolution so swipes scale to the device.
  const resolution = getScreenResolution();
  if (resolution) {
    initDeviceContext(resolution);
    console.log(`Screen resolution: ${resolution[0]}x${resolution[1]}`);
  } else {
    console.log("Screen resolution: using default 1080x2400 swipe coords");
  }
  console.log("Android Action Kernel Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision: ${Config.VISION_MODE} | Streaming: ${Config.STREAMING_ENABLED}`);
  console.log(`Max elements: ${Config.MAX_ELEMENTS} | History: ${Config.MAX_HISTORY_STEPS} steps`);
  const llm = getLlmProvider();
  // Phase 2B: Session logging
  const logger = new SessionLogger(
    Config.LOG_DIR,
    goal,
    Config.LLM_PROVIDER,
    Config.getModel()
  );
  // Phase 4A: multi-turn conversation memory, seeded with the system prompt.
  const messages: ChatMessage[] = [
    { role: "system", content: SYSTEM_PROMPT },
  ];
  let prevElements: UIElement[] = [];
  let stuckCount = 0;
  for (let step = 0; step < steps; step++) {
    console.log(`\n--- Step ${step + 1}/${steps} ---`);
    // 1. Perception: Capture screen state
    console.log("Scanning screen...");
    const { elements, compactJson: screenContext } = getScreenState();
    // 1B. Foreground app detection
    const foregroundApp = getForegroundApp();
    if (foregroundApp) {
      console.log(`Foreground: ${foregroundApp}`);
    }
    // 2. Screen diff: detect stuck loops
    let diffContext = "";
    let screenChanged = true;
    if (step > 0) {
      const diff = diffScreenState(prevElements, elements);
      screenChanged = diff.changed;
      diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;
      if (!diff.changed) {
        stuckCount++;
        console.log(
          `Warning: Screen unchanged for ${stuckCount} step(s).`
        );
        if (stuckCount >= Config.STUCK_THRESHOLD) {
          console.log(
            `Stuck for ${stuckCount} steps. Injecting recovery hint.`
          );
          diffContext +=
            `\nWARNING: You have been stuck for ${stuckCount} steps. ` +
            `The screen is NOT changing. Try a DIFFERENT action: ` +
            `swipe to scroll, press back, go home, or launch a different app.` +
            `\nYour plan is not working. Create a NEW plan with a different approach.`;
        }
      } else {
        stuckCount = 0;
      }
    }
    prevElements = elements;
    // 3. Vision: capture screenshot based on VISION_MODE
    let screenshotBase64: string | null = null;
    let visionContext = "";
    const shouldCaptureVision =
      Config.VISION_MODE === "always" ||
      (Config.VISION_MODE === "fallback" && elements.length === 0);
    if (shouldCaptureVision) {
      screenshotBase64 = captureScreenshotBase64();
      if (elements.length === 0) {
        visionContext =
          "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
          "A screenshot has been captured. The screen likely contains custom-drawn " +
          "content (game, WebView, or Flutter). Try using coordinate-based taps on " +
          "common UI positions, or use 'back'/'home' to navigate away.";
      }
      if (screenshotBase64 && llm.capabilities.supportsImages) {
        console.log("Sending screenshot to LLM");
      }
    }
    // 4. Build user message with all context
    const foregroundLine = foregroundApp
      ? `FOREGROUND_APP: ${foregroundApp}\n\n`
      : "";
    const textContent =
      `GOAL: ${goal}\n\n${foregroundLine}SCREEN_CONTEXT:\n${screenContext}${diffContext}${visionContext}`;
    // Build content parts (text + optional image)
    const userContent: ContentPart[] = [{ type: "text", text: textContent }];
    if (screenshotBase64 && llm.capabilities.supportsImages) {
      userContent.push({
        type: "image",
        base64: screenshotBase64,
        mimeType: "image/png",
      });
    }
    messages.push({ role: "user", content: userContent });
    // Trim messages to keep within history limit
    const trimmed = trimMessages(messages, Config.MAX_HISTORY_STEPS);
    // 5. Reasoning: Get LLM decision
    const llmStart = performance.now();
    let decision: ActionDecision;
    try {
      decision = await getDecisionStreaming(llm, trimmed);
    } catch (err) {
      console.log(`LLM Error: ${(err as Error).message}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting" };
    }
    const llmLatency = performance.now() - llmStart;
    // Log thinking and planning
    if (decision.think) {
      console.log(`Think: ${decision.think}`);
    }
    if (decision.plan) {
      console.log(`Plan: ${decision.plan.join(" -> ")}`);
    }
    if (decision.planProgress) {
      console.log(`Progress: ${decision.planProgress}`);
    }
    // Bug fix: action and reason were previously concatenated with no separator.
    console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"} (${Math.round(llmLatency)}ms)`);
    // Append assistant response to conversation
    messages.push({
      role: "assistant",
      content: JSON.stringify(decision),
    });
    // 6. Action: Execute the decision
    const actionStart = performance.now();
    let result: ActionResult;
    try {
      result = executeAction(decision);
    } catch (err) {
      console.log(`Action Error: ${(err as Error).message}`);
      result = { success: false, message: (err as Error).message };
    }
    const actionLatency = performance.now() - actionStart;
    // Log step
    logger.logStep(
      step + 1,
      foregroundApp,
      elements.length,
      screenChanged,
      decision,
      result,
      Math.round(llmLatency),
      Math.round(actionLatency)
    );
    console.log(`Messages in context: ${trimmed.length}`);
    // 7. Check for goal completion
    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      logger.finalize(true);
      return;
    }
    // Wait for UI to update
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }
  console.log("\nMax steps reached. Task may be incomplete.");
  logger.finalize(false);
}
// ===========================================
// Entry Point
// ===========================================
/**
 * Entry point: validates configuration, reads the goal from the first stdin
 * chunk, and runs the agent loop.
 */
async function main(): Promise<void> {
  try {
    Config.validate();
  } catch (e) {
    console.log(`Configuration Error: ${(e as Error).message}`);
    return;
  }
  // Prompt for the goal and read a single stdin chunk.
  process.stdout.write("Enter your goal: ");
  const reader = Bun.stdin.stream().getReader();
  const { value } = await reader.read();
  reader.releaseLock();
  const goal = new TextDecoder().decode(value).trim();
  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }
  await runAgent(goal);
}
main();

535
src/llm-providers.ts Normal file
View File

@@ -0,0 +1,535 @@
/**
* LLM Provider module for Android Action Kernel.
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
*
* Phase 3: Real multimodal vision (image content parts)
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
* Phase 5: Streaming responses (getDecisionStream)
*/
import OpenAI from "openai";
import {
BedrockRuntimeClient,
InvokeModelCommand,
InvokeModelWithResponseStreamCommand,
} from "@aws-sdk/client-bedrock-runtime";
import { generateText, streamText } from "ai";
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
import { Config } from "./config.js";
import {
GROQ_API_BASE_URL,
BEDROCK_ANTHROPIC_MODELS,
BEDROCK_META_MODELS,
} from "./constants.js";
import type { ActionDecision } from "./actions.js";
// ===========================================
// System Prompt — all 15 actions + planning
// ===========================================
/**
 * System prompt shared by all providers. Enumerates the 15 supported
 * actions, the element properties the model will see in SCREEN_CONTEXT,
 * and the behavioral rules (planning fields, stuck detection, etc.).
 *
 * NOTE: the action names and JSON shapes described here must stay in sync
 * with the ActionDecision interface and the executor in actions.ts —
 * TODO confirm against that file when editing.
 */
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
You will receive:
1. GOAL — the user's task.
2. FOREGROUND_APP — the currently active app package and activity.
3. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates and states.
4. SCREENSHOT — an image of the current screen (when available).
5. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
6. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
Previous conversation turns contain your earlier observations and actions (multi-turn memory).
You must output ONLY a valid JSON object with your next action.
═══════════════════════════════════════════
THINKING & PLANNING
═══════════════════════════════════════════
Before each action, include a "think" field with your reasoning about the current state and what to do next.
Optionally include:
- "plan": an array of 3-5 high-level steps to achieve the goal
- "planProgress": a brief note on which plan step you're currently on
Example:
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
═══════════════════════════════════════════
AVAILABLE ACTIONS (15 total)
═══════════════════════════════════════════
Navigation:
{"action": "tap", "coordinates": [x, y], "reason": "..."}
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
{"action": "enter", "reason": "Press Enter/submit"}
{"action": "back", "reason": "Navigate back"}
{"action": "home", "reason": "Go to home screen"}
Text Input:
{"action": "type", "text": "Hello World", "reason": "..."}
{"action": "clear", "reason": "Clear current text field before typing"}
App Control:
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
Data:
{"action": "screenshot", "reason": "Capture current screen"}
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
{"action": "clipboard_get", "reason": "Read clipboard contents"}
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
System:
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
{"action": "wait", "reason": "Wait for screen to load"}
{"action": "done", "reason": "Task is complete"}
═══════════════════════════════════════════
ELEMENT PROPERTIES YOU WILL SEE
═══════════════════════════════════════════
Each element in SCREEN_CONTEXT has:
- text: visible label or content description
- center: [x, y] coordinates to tap
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
- enabled: false (only shown when disabled — DO NOT tap disabled elements!)
- checked: true (only shown for ON checkboxes/toggles)
- focused: true (only shown when field has input focus)
- hint: placeholder text (only shown when present)
- editable: true (only shown for text input fields)
- scrollable: true (only shown for scrollable containers)
═══════════════════════════════════════════
CRITICAL RULES
═══════════════════════════════════════════
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered.
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
11. PASSWORDS: Never log or output the text of password fields.
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
13. SEARCH: After typing in a search field, use "enter" to submit the search.
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
// ===========================================
// Chat Message Types (Phase 4A)
// ===========================================
/** One piece of message content: plain text, or an inline base64 image. */
export type ContentPart =
  | { type: "text"; text: string }
  | { type: "image"; base64: string; mimeType: "image/png" | "image/jpeg" };
/**
 * Provider-neutral chat message. `content` is either a plain string or a
 * list of parts (text + screenshots) for multimodal turns. Each provider
 * converts this shape into its own wire format.
 */
export interface ChatMessage {
  role: "system" | "user" | "assistant";
  content: string | ContentPart[];
}
// ===========================================
// Provider Interface
// ===========================================
/**
 * Common contract implemented by every LLM backend
 * (OpenAI/Groq, OpenRouter, AWS Bedrock).
 */
export interface LLMProvider {
  readonly capabilities: {
    supportsImages: boolean; // whether image content parts are forwarded to the model
    supportsStreaming: boolean; // whether getDecisionStream is usable
  };
  // Single-shot: returns a parsed ActionDecision for the conversation so far.
  getDecision(messages: ChatMessage[]): Promise<ActionDecision>;
  // Optional streaming variant: yields raw response-text chunks.
  getDecisionStream?(messages: ChatMessage[]): AsyncIterable<string>;
}
// ===========================================
// Message Trimming (Phase 4A)
// ===========================================
/**
 * Caps the conversation at `maxHistorySteps` user/assistant exchanges.
 *
 * The leading system message (when present) is always preserved. When the
 * history exceeds the cap, the oldest messages are dropped and a single
 * synthetic user note ("[N earlier steps omitted]") is inserted so the
 * model knows context was elided.
 */
export function trimMessages(
  messages: ChatMessage[],
  maxHistorySteps: number
): ChatMessage[] {
  if (messages.length === 0) return messages;
  const hasSystem = messages[0].role === "system";
  const systemMsg = hasSystem ? messages[0] : null;
  const history = hasSystem ? messages.slice(1) : messages;
  // One step = one user message + one assistant reply.
  const keepCount = maxHistorySteps * 2;
  if (history.length <= keepCount) return messages;
  const dropCount = history.length - keepCount;
  const note: ChatMessage = {
    role: "user",
    content: `[${Math.floor(dropCount / 2)} earlier steps omitted]`,
  };
  const kept = history.slice(dropCount);
  return systemMsg ? [systemMsg, note, ...kept] : [note, ...kept];
}
// ===========================================
// OpenAI / Groq Provider
// ===========================================
/**
 * OpenAI-compatible provider. Serves both OpenAI proper and Groq (which
 * exposes an OpenAI-compatible endpoint). Groq-hosted models are treated
 * as text-only: image parts are replaced with a text placeholder.
 */
class OpenAIProvider implements LLMProvider {
  private client: OpenAI;
  private model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };

  constructor() {
    const isGroq = Config.LLM_PROVIDER === "groq";
    if (isGroq) {
      this.client = new OpenAI({
        apiKey: Config.GROQ_API_KEY,
        baseURL: GROQ_API_BASE_URL,
      });
    } else {
      this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
    }
    this.model = isGroq ? Config.GROQ_MODEL : Config.OPENAI_MODEL;
    this.capabilities = { supportsImages: !isGroq, supportsStreaming: true };
  }

  /** Converts provider-neutral messages into the OpenAI chat format. */
  private toOpenAIMessages(
    messages: ChatMessage[]
  ): OpenAI.ChatCompletionMessageParam[] {
    const converted: OpenAI.ChatCompletionMessageParam[] = [];
    for (const msg of messages) {
      if (typeof msg.content === "string") {
        converted.push({
          role: msg.role,
          content: msg.content,
        } as OpenAI.ChatCompletionMessageParam);
        continue;
      }
      const parts: OpenAI.ChatCompletionContentPart[] = [];
      for (const part of msg.content) {
        if (part.type === "text") {
          parts.push({ type: "text" as const, text: part.text });
        } else if (this.capabilities.supportsImages) {
          // Inline the screenshot as a low-detail data URL.
          parts.push({
            type: "image_url" as const,
            image_url: {
              url: `data:${part.mimeType};base64,${part.base64}`,
              detail: "low" as const,
            },
          });
        } else {
          // Text-only model (Groq): stand in for the image with a note.
          parts.push({ type: "text" as const, text: "[Screenshot attached]" });
        }
      }
      converted.push({
        role: msg.role,
        content: parts,
      } as OpenAI.ChatCompletionMessageParam);
    }
    return converted;
  }

  /** Single-shot completion in JSON mode; parsed into an ActionDecision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const response = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: this.toOpenAIMessages(messages),
    });
    return parseJsonResponse(response.choices[0].message.content ?? "{}");
  }

  /** Streams raw response text chunk by chunk. */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const stream = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: this.toOpenAIMessages(messages),
      stream: true,
    });
    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta?.content;
      if (delta) yield delta;
    }
  }
}
// ===========================================
// OpenRouter Provider (Vercel AI SDK)
// ===========================================
/**
 * OpenRouter provider, driven through the Vercel AI SDK.
 * Fully multimodal and streaming-capable.
 */
class OpenRouterProvider implements LLMProvider {
  private openrouter: ReturnType<typeof createOpenRouter>;
  private model: string;
  readonly capabilities = { supportsImages: true, supportsStreaming: true };

  constructor() {
    this.openrouter = createOpenRouter({ apiKey: Config.OPENROUTER_API_KEY });
    this.model = Config.OPENROUTER_MODEL;
  }

  /**
   * Splits out the system prompt and converts remaining messages — image
   * parts become data URLs — into the Vercel AI SDK message shape.
   */
  private toVercelMessages(messages: ChatMessage[]) {
    const systemMsg = messages.find((m) => m.role === "system");
    const converted = messages
      .filter((m) => m.role !== "system")
      .map((msg) => {
        const role = msg.role as "user" | "assistant";
        if (typeof msg.content === "string") {
          return { role, content: msg.content };
        }
        const parts = msg.content.map((part) =>
          part.type === "text"
            ? { type: "text" as const, text: part.text }
            : {
                type: "image" as const,
                image: `data:${part.mimeType};base64,${part.base64}`,
              }
        );
        return { role, content: parts };
      });
    return {
      system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
      messages: converted,
    };
  }

  /** Single-shot generation; parsed into an ActionDecision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const payload = this.toVercelMessages(messages);
    const result = await generateText({
      model: this.openrouter.chat(this.model),
      system: payload.system,
      messages: payload.messages as any,
    });
    return parseJsonResponse(result.text);
  }

  /** Streams raw response text chunk by chunk. */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const payload = this.toVercelMessages(messages);
    const result = streamText({
      model: this.openrouter.chat(this.model),
      system: payload.system,
      messages: payload.messages as any,
    });
    yield* result.textStream;
  }
}
// ===========================================
// AWS Bedrock Provider
// ===========================================
/**
 * AWS Bedrock provider. Handles three model families with distinct wire
 * formats, detected by substring match on the configured model id:
 *   - Anthropic (Claude): messages API, multimodal, streaming
 *   - Meta (Llama): single flattened prompt with chat-template tokens, text-only
 *   - fallback (e.g. Titan-style): plain inputText prompt, text-only
 */
class BedrockProvider implements LLMProvider {
  private client: BedrockRuntimeClient;
  private model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };
  constructor() {
    this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
    this.model = Config.BEDROCK_MODEL;
    // Only Anthropic models on Bedrock support images
    this.capabilities = {
      supportsImages: this.isAnthropicModel(),
      supportsStreaming: true,
    };
  }
  // Family detection is substring-based against the known-model id lists.
  private isAnthropicModel(): boolean {
    return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id));
  }
  private isMetaModel(): boolean {
    return BEDROCK_META_MODELS.some((id) =>
      this.model.toLowerCase().includes(id)
    );
  }
  /**
   * Converts neutral ChatMessages into the Anthropic messages format:
   * system prompt extracted separately; image parts become base64 sources.
   */
  private buildAnthropicMessages(messages: ChatMessage[]) {
    const systemMsg = messages.find((m) => m.role === "system");
    const nonSystem = messages.filter((m) => m.role !== "system");
    const converted = nonSystem.map((msg) => {
      if (typeof msg.content === "string") {
        return { role: msg.role, content: msg.content };
      }
      const parts = msg.content.map((part) => {
        if (part.type === "text") {
          return { type: "text", text: part.text };
        }
        return {
          type: "image",
          source: {
            type: "base64",
            media_type: part.mimeType,
            data: part.base64,
          },
        };
      });
      return { role: msg.role, content: parts };
    });
    return {
      system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
      messages: converted,
    };
  }
  /**
   * Serializes the request body for the configured model family.
   * Non-Anthropic paths lose multi-turn memory and images: only the
   * system prompt and the LAST user message are sent.
   */
  private buildRequest(messages: ChatMessage[]): string {
    if (this.isAnthropicModel()) {
      const { system, messages: converted } = this.buildAnthropicMessages(messages);
      return JSON.stringify({
        anthropic_version: "bedrock-2023-05-31",
        max_tokens: 1024,
        system,
        messages: converted,
      });
    }
    // For Meta/other models, flatten to single prompt (no multi-turn / image support)
    const systemContent = messages.find((m) => m.role === "system");
    const userMessages = messages
      .filter((m) => m.role === "user")
      .map((m) =>
        typeof m.content === "string"
          ? m.content
          : m.content
              .filter((p) => p.type === "text")
              .map((p) => (p as { type: "text"; text: string }).text)
              .join("\n")
      );
    const lastUserContent = userMessages[userMessages.length - 1] ?? "";
    const sysText =
      typeof systemContent?.content === "string" ? systemContent.content : "";
    if (this.isMetaModel()) {
      // Llama chat-template tokens are embedded directly in the prompt.
      return JSON.stringify({
        prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${sysText}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
        max_gen_len: 512,
        temperature: 0.1,
      });
    }
    // Fallback request shape (inputText/textGenerationConfig).
    return JSON.stringify({
      inputText: `${sysText}\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object.`,
      textGenerationConfig: {
        maxTokenCount: 512,
        temperature: 0.1,
      },
    });
  }
  /** Pulls the generated text out of the family-specific response shape. */
  private extractResponse(responseBody: Record<string, any>): string {
    if (this.isAnthropicModel()) {
      return responseBody.content[0].text;
    }
    if (this.isMetaModel()) {
      return responseBody.generation ?? "";
    }
    return responseBody.results[0].outputText;
  }
  /** Single-shot InvokeModel call; parsed into an ActionDecision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const requestBody = this.buildRequest(messages);
    const command = new InvokeModelCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
      accept: "application/json",
    });
    const response = await this.client.send(command);
    const responseBody = JSON.parse(new TextDecoder().decode(response.body));
    const resultText = this.extractResponse(responseBody);
    return parseJsonResponse(resultText);
  }
  /**
   * Streams response text. Only Anthropic models stream; other families
   * fall back to a single yield of the full (stringified) decision.
   */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    if (!this.isAnthropicModel()) {
      // Fallback: non-streaming for non-Anthropic models
      const decision = await this.getDecision(messages);
      yield JSON.stringify(decision);
      return;
    }
    const { system, messages: converted } = this.buildAnthropicMessages(messages);
    const requestBody = JSON.stringify({
      anthropic_version: "bedrock-2023-05-31",
      max_tokens: 1024,
      system,
      messages: converted,
    });
    const command = new InvokeModelWithResponseStreamCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
    });
    const response = await this.client.send(command);
    if (response.body) {
      // Each stream event carries a JSON chunk; only text deltas are yielded.
      for await (const event of response.body) {
        if (event.chunk?.bytes) {
          const data = JSON.parse(new TextDecoder().decode(event.chunk.bytes));
          if (data.type === "content_block_delta" && data.delta?.text) {
            yield data.delta.text;
          }
        }
      }
    }
  }
}
// ===========================================
// Shared JSON Parsing
// ===========================================
/**
 * Parses an LLM reply into an ActionDecision.
 *
 * Tries strict JSON first; if the model wrapped the object in markdown
 * fences or prose, extracts the first *balanced* {...} span and parses
 * that. The previous regex (`\{[\s\S]*?\}`) was non-greedy, so any nested
 * object — e.g. a "launch" action carrying an "extras" map — was truncated
 * at the first `}` and failed to parse. On total failure, returns a safe
 * "wait" action so the agent loop keeps running.
 */
function parseJsonResponse(text: string): ActionDecision {
  try {
    return JSON.parse(text);
  } catch {
    const candidate = extractFirstJsonObject(text);
    if (candidate !== null) {
      try {
        return JSON.parse(candidate);
      } catch {
        // fall through to the warning below
      }
    }
    console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
    return { action: "wait", reason: "Failed to parse response, waiting" };
  }
}

/**
 * Returns the first balanced JSON object embedded in `text`, or null.
 * Tracks brace depth while skipping braces that appear inside string
 * literals (including escaped quotes), so nested objects come out whole.
 */
function extractFirstJsonObject(text: string): string | null {
  const start = text.indexOf("{");
  if (start === -1) return null;
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (escaped) {
      escaped = false;
      continue;
    }
    if (inString) {
      if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === "{") depth++;
    else if (ch === "}") {
      depth--;
      if (depth === 0) return text.slice(start, i + 1);
    }
  }
  return null; // unbalanced — no complete object found
}
// ===========================================
// Factory
// ===========================================
/**
 * Instantiates the provider selected by Config.LLM_PROVIDER.
 * Any value other than "bedrock"/"openrouter" (i.e. "openai" or "groq")
 * resolves to the OpenAI-compatible provider.
 */
export function getLlmProvider(): LLMProvider {
  switch (Config.LLM_PROVIDER) {
    case "bedrock":
      return new BedrockProvider();
    case "openrouter":
      return new OpenRouterProvider();
    default:
      return new OpenAIProvider();
  }
}

129
src/logger.ts Normal file
View File

@@ -0,0 +1,129 @@
/**
* Session logging for Android Action Kernel.
* Writes incremental .partial.json after each step (crash-safe),
* and a final .json summary at session end.
*/
import { mkdirSync, writeFileSync } from "fs";
import { join } from "path";
import type { ActionDecision } from "./actions.js";
/** One agent step as persisted in the session log. */
export interface StepLog {
  step: number; // 1-based step index within the session
  timestamp: string; // ISO-8601, recorded when the step is logged
  foregroundApp: string | null; // active package/activity, if known
  elementCount: number; // number of UI elements extracted this step
  screenChanged: boolean; // whether the screen hash differed from last step
  // Subset of the LLM's ActionDecision kept for the log.
  llmDecision: {
    action: string;
    reason?: string;
    coordinates?: [number, number];
    text?: string;
    think?: string;
    plan?: string[];
    planProgress?: string;
  };
  // Outcome of executing the decided action.
  actionResult: {
    success: boolean;
    message: string;
  };
  llmLatencyMs: number; // time spent waiting on the LLM
  actionLatencyMs: number; // time spent executing the action
}
/** Whole-session record written as the final .json log file. */
export interface SessionSummary {
  sessionId: string; // unique id, also used as the log filename stem
  goal: string;
  provider: string;
  model: string;
  startTime: string; // ISO-8601
  endTime: string; // ISO-8601, set when the summary is built
  totalSteps: number;
  successCount: number; // steps whose actionResult.success was true
  failCount: number; // steps whose actionResult.success was false
  completed: boolean; // true only when the agent reported "done"
  steps: StepLog[];
}
/**
 * Collects per-step telemetry for one agent session and persists it.
 * After every step a `<id>.partial.json` snapshot is written so a crash
 * loses nothing; `finalize()` writes the definitive `<id>.json` summary.
 */
export class SessionLogger {
  private readonly sessionId: string;
  private readonly logDir: string;
  private readonly goal: string;
  private readonly provider: string;
  private readonly model: string;
  private readonly startTime: string;
  private readonly steps: StepLog[] = [];

  constructor(logDir: string, goal: string, provider: string, model: string) {
    const nonce = Math.random().toString(36).slice(2, 8);
    this.sessionId = `${Date.now()}-${nonce}`;
    this.logDir = logDir;
    this.goal = goal;
    this.provider = provider;
    this.model = model;
    this.startTime = new Date().toISOString();
    mkdirSync(this.logDir, { recursive: true });
  }

  /** Records one step and refreshes the crash-safe partial log on disk. */
  logStep(
    step: number,
    foregroundApp: string | null,
    elementCount: number,
    screenChanged: boolean,
    decision: ActionDecision,
    result: { success: boolean; message: string },
    llmLatencyMs: number,
    actionLatencyMs: number
  ): void {
    this.steps.push({
      step,
      timestamp: new Date().toISOString(),
      foregroundApp,
      elementCount,
      screenChanged,
      llmDecision: {
        action: decision.action,
        reason: decision.reason,
        coordinates: decision.coordinates,
        text: decision.text,
        think: decision.think,
        plan: decision.plan,
        planProgress: decision.planProgress,
      },
      actionResult: {
        success: result.success,
        message: result.message,
      },
      llmLatencyMs,
      actionLatencyMs,
    });
    // Snapshot after every step so a crash mid-session loses nothing.
    writeFileSync(
      join(this.logDir, `${this.sessionId}.partial.json`),
      JSON.stringify(this.buildSummary(false), null, 2)
    );
  }

  /** Writes the final session summary and announces its location. */
  finalize(completed: boolean): void {
    const finalPath = join(this.logDir, `${this.sessionId}.json`);
    writeFileSync(finalPath, JSON.stringify(this.buildSummary(completed), null, 2));
    console.log(`Session log saved: ${finalPath}`);
  }

  /** Assembles the summary object from accumulated state. */
  private buildSummary(completed: boolean): SessionSummary {
    let successCount = 0;
    for (const s of this.steps) {
      if (s.actionResult.success) successCount += 1;
    }
    return {
      sessionId: this.sessionId,
      goal: this.goal,
      provider: this.provider,
      model: this.model,
      startTime: this.startTime,
      endTime: new Date().toISOString(),
      totalSteps: this.steps.length,
      successCount,
      failCount: this.steps.length - successCount,
      completed,
      steps: this.steps,
    };
  }
}

249
src/sanitizer.ts Normal file
View File

@@ -0,0 +1,249 @@
/**
* XML Sanitizer for Android Action Kernel.
* Parses Android Accessibility XML and extracts interactive UI elements
* with full state information and parent-child hierarchy context.
*/
import { XMLParser } from "fast-xml-parser";
/** Full description of one node extracted from the accessibility XML. */
export interface UIElement {
  id: string; // resource-id attribute (may be "")
  text: string; // visible text, falling back to content-desc
  type: string; // class short name, e.g. "Button", "EditText"
  bounds: string; // raw bounds string "[x1,y1][x2,y2]" from the XML
  center: [number, number]; // tap point: midpoint of the bounds
  size: [number, number]; // [width, height] in pixels
  clickable: boolean;
  editable: boolean; // derived from class name or editable attribute
  enabled: boolean; // defaults to true when the attribute is absent
  checked: boolean;
  focused: boolean;
  selected: boolean;
  scrollable: boolean;
  longClickable: boolean;
  password: boolean; // password field — callers must not log its text
  hint: string; // placeholder text ("" when absent)
  action: "tap" | "type" | "longpress" | "scroll" | "read"; // suggested interaction
  parent: string; // label of nearest labeled ancestor (hierarchy context)
  depth: number; // nesting depth in the accessibility tree
}
/**
 * Builds a deterministic fingerprint of the current screen from element
 * identity, label, position, and toggle state. Comparing fingerprints
 * between steps tells the agent whether its last action changed anything.
 */
export function computeScreenHash(elements: UIElement[]): string {
  const fingerprints: string[] = [];
  for (const el of elements) {
    const [cx, cy] = el.center;
    fingerprints.push(`${el.id}|${el.text}|${cx},${cy}|${el.enabled}|${el.checked}`);
  }
  return fingerprints.join(";");
}
/**
 * Parses Android Accessibility XML and returns a rich list of interactive elements.
 * Preserves state (enabled, checked, focused) and hierarchy context.
 *
 * An element is kept when it is interactive (clickable / editable /
 * long-clickable / scrollable) OR carries visible text / content-desc.
 * Zero-size elements are skipped but their children are still walked.
 * Returns [] when the XML fails to parse (e.g. screen still loading).
 */
export function getInteractiveElements(xmlContent: string): UIElement[] {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    allowBooleanAttributes: true,
  });
  let parsed: unknown;
  try {
    parsed = parser.parse(xmlContent);
  } catch {
    console.log("Warning: Error parsing XML. The screen might be loading.");
    return [];
  }
  const elements: UIElement[] = [];
  // Depth-first walk; parentLabel is the label of the nearest labeled
  // ancestor, threaded down so each element keeps hierarchy context.
  function walk(node: any, parentLabel: string, depth: number): void {
    if (!node || typeof node !== "object") return;
    if (node["@_bounds"]) {
      const isClickable = node["@_clickable"] === "true";
      const isLongClickable = node["@_long-clickable"] === "true";
      const isScrollable = node["@_scrollable"] === "true";
      const isEnabled = node["@_enabled"] !== "false"; // default true
      const isChecked = node["@_checked"] === "true";
      const isFocused = node["@_focused"] === "true";
      const isSelected = node["@_selected"] === "true";
      const isPassword = node["@_password"] === "true";
      const elementClass = node["@_class"] ?? "";
      // Editable is inferred from the widget class when the XML has no
      // explicit editable attribute.
      const isEditable =
        elementClass.includes("EditText") ||
        elementClass.includes("AutoCompleteTextView") ||
        node["@_editable"] === "true";
      const text: string = node["@_text"] ?? "";
      const desc: string = node["@_content-desc"] ?? "";
      const resourceId: string = node["@_resource-id"] ?? "";
      const hint: string = node["@_hint"] ?? "";
      // Build a label for this node to use as parent context for children
      const typeName = elementClass.split(".").pop() ?? "";
      const nodeLabel = text || desc || resourceId.split("/").pop() || typeName;
      // Determine if this element should be included
      const isInteractive = isClickable || isEditable || isLongClickable || isScrollable;
      const hasContent = !!(text || desc);
      if (isInteractive || hasContent) {
        const bounds: string = node["@_bounds"];
        try {
          // bounds format: "[x1,y1][x2,y2]" → [x1, y1, x2, y2]
          const coords = bounds
            .replace("][", ",")
            .replace("[", "")
            .replace("]", "")
            .split(",")
            .map(Number);
          const [x1, y1, x2, y2] = coords;
          const centerX = Math.floor((x1 + x2) / 2);
          const centerY = Math.floor((y1 + y2) / 2);
          const width = x2 - x1;
          const height = y2 - y1;
          // Skip zero-size elements (invisible)
          if (width <= 0 || height <= 0) {
            // still walk children
          } else {
            // Pick the single most useful interaction for this element.
            let suggestedAction: UIElement["action"];
            if (isEditable) suggestedAction = "type";
            else if (isLongClickable && !isClickable) suggestedAction = "longpress";
            else if (isScrollable && !isClickable) suggestedAction = "scroll";
            else if (isClickable) suggestedAction = "tap";
            else suggestedAction = "read";
            elements.push({
              id: resourceId,
              text: text || desc,
              type: typeName,
              bounds,
              center: [centerX, centerY],
              size: [width, height],
              clickable: isClickable,
              editable: isEditable,
              enabled: isEnabled,
              checked: isChecked,
              focused: isFocused,
              selected: isSelected,
              scrollable: isScrollable,
              longClickable: isLongClickable,
              password: isPassword,
              hint: hint,
              action: suggestedAction,
              parent: parentLabel,
              depth,
            });
          }
        } catch {
          // Skip malformed bounds
        }
      }
      // Recurse with updated parent label
      walkChildren(node, nodeLabel, depth + 1);
      return;
    }
    // No bounds on this node — just recurse
    walkChildren(node, parentLabel, depth);
  }
  // Visits child <node> entries (singular or array) plus a possible
  // top-level <hierarchy> wrapper.
  function walkChildren(node: any, parentLabel: string, depth: number): void {
    if (node.node) {
      const children = Array.isArray(node.node) ? node.node : [node.node];
      for (const child of children) {
        walk(child, parentLabel, depth);
      }
    }
    if (node.hierarchy) {
      walk(node.hierarchy, parentLabel, depth);
    }
  }
  walk(parsed, "root", 0);
  return elements;
}
// ===========================================
// Smart Element Filtering (Phase 2A)
// ===========================================
/**
 * Compact representation sent to the LLM — only essential fields.
 * Non-default flags are included conditionally to minimize tokens;
 * an absent flag means its default (enabled=true, checked/focused/
 * editable/scrollable=false, hint="").
 */
export interface CompactUIElement {
  text: string; // label shown to the LLM
  center: [number, number]; // tap coordinates
  action: UIElement["action"]; // suggested interaction
  // Only included when non-default
  enabled?: false;
  checked?: true;
  focused?: true;
  hint?: string;
  editable?: true;
  scrollable?: true;
}
/**
 * Reduces a full UIElement to the token-efficient form sent to the LLM.
 * State flags are emitted only when they differ from their defaults
 * (enabled defaults to true; the rest default to false / empty).
 */
export function compactElement(el: UIElement): CompactUIElement {
  const result: CompactUIElement = {
    text: el.text,
    center: el.center,
    action: el.action,
  };
  if (!el.enabled) result.enabled = false;
  if (el.checked) result.checked = true;
  if (el.focused) result.focused = true;
  if (el.hint) result.hint = el.hint;
  if (el.editable) result.editable = true;
  if (el.scrollable) result.scrollable = true;
  return result;
}
/**
 * Relevance score used to rank elements before sending them to the LLM.
 * Enabled (+10), editable (+8), focused (+6), tappable/long-pressable (+5),
 * and labeled (+3) elements rank higher.
 */
function scoreElement(el: UIElement): number {
  const contributions: Array<[boolean, number]> = [
    [el.enabled, 10],
    [el.editable, 8],
    [el.focused, 6],
    [el.clickable || el.longClickable, 5],
    [Boolean(el.text), 3],
  ];
  let total = 0;
  for (const [applies, points] of contributions) {
    if (applies) total += points;
  }
  return total;
}
/**
 * Deduplicates elements whose centers land in the same 5px grid bucket
 * (keeping the highest-scoring one per bucket), ranks the survivors by
 * relevance score, and returns the top `limit` in compact form.
 */
export function filterElements(
  elements: UIElement[],
  limit: number
): CompactUIElement[] {
  const GRID = 5;
  // Bucket by rounded center; keep only the best-scoring element per bucket.
  const best = new Map<string, UIElement>();
  for (const candidate of elements) {
    const bx = Math.round(candidate.center[0] / GRID) * GRID;
    const by = Math.round(candidate.center[1] / GRID) * GRID;
    const key = `${bx},${by}`;
    const current = best.get(key);
    if (current === undefined || scoreElement(candidate) > scoreElement(current)) {
      best.set(key, candidate);
    }
  }
  // Highest score first, then cap and compact.
  return [...best.values()]
    .sort((a, b) => scoreElement(b) - scoreElement(a))
    .slice(0, limit)
    .map(compactElement);
}