Flatten project structure: move android-action-kernel/ to root

Removes the unnecessary nesting — all source, config, and docs now live
at the project root for simpler paths and commands.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Sanju Sivalingam
2026-02-06 16:02:40 +05:30
parent 610fd04818
commit 879509aebc
16 changed files with 862 additions and 7 deletions

src/kernel.ts (new file, 416 lines)

@@ -0,0 +1,416 @@
/**
* Android Action Kernel - Main Agent Loop (TypeScript/Bun Edition)
*
* An AI agent that controls Android devices through the accessibility API.
* Uses LLMs to make decisions based on screen context.
*
* Features:
* - Perception -> Reasoning -> Action loop
* - Screen state diffing (stuck loop detection)
* - Error recovery with retries
* - Vision fallback & always-on multimodal screenshots
* - Dynamic early exit on goal completion
* - Smart element filtering (compact JSON, top-N scoring)
* - Multi-turn conversation memory
* - Multi-step planning (think/plan/planProgress)
* - Streaming LLM responses
* - Session logging with crash-safe partial writes
* - Auto-detect screen resolution & foreground app
* - 15 actions: tap, type, enter, swipe, home, back, wait, done,
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
*
* Usage:
* bun run src/kernel.ts
*/
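
// Assumed prerequisites (not documented in this file): a connected device or
// emulator reachable via `adb` on PATH, and LLM provider credentials wired
// through Config (checked by Config.validate() in main()).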
import { existsSync, readFileSync } from "fs";
import { Config } from "./config.js";
import {
executeAction,
runAdbCommand,
getScreenResolution,
getForegroundApp,
initDeviceContext,
type ActionDecision,
type ActionResult,
} from "./actions.js";
import {
getLlmProvider,
trimMessages,
SYSTEM_PROMPT,
type LLMProvider,
type ChatMessage,
type ContentPart,
} from "./llm-providers.js";
import {
getInteractiveElements,
computeScreenHash,
filterElements,
type UIElement,
} from "./sanitizer.js";
import {
DEVICE_SCREENSHOT_PATH,
LOCAL_SCREENSHOT_PATH,
} from "./constants.js";
import { SessionLogger } from "./logger.js";

// ===========================================
// Screen Perception
// ===========================================
interface ScreenState {
elements: UIElement[];
compactJson: string;
}
/**
* Dumps the current UI XML and returns parsed elements + compact filtered JSON for the LLM.
*/
function getScreenState(): ScreenState {
try {
runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
} catch {
console.log("Warning: ADB screen capture failed.");
return { elements: [], compactJson: "Error: Could not capture screen." };
}
if (!existsSync(Config.LOCAL_DUMP_PATH)) {
return { elements: [], compactJson: "Error: Could not capture screen." };
}
const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
const elements = getInteractiveElements(xmlContent);
const compact = filterElements(elements, Config.MAX_ELEMENTS);
return { elements, compactJson: JSON.stringify(compact) };
}
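
// Illustrative only: the compact JSON handed to the LLM is whatever
// filterElements() in sanitizer.ts emits. A plausible (assumed) shape:
//   [{"text":"Settings","desc":"","bounds":[42,880,1038,1012]}, ...]
// The authoritative fields are defined by UIElement, not by this comment.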
/**
* Captures a screenshot and returns the base64-encoded PNG, or null on failure.
*/
function captureScreenshotBase64(): string | null {
try {
runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);
if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      const buffer = readFileSync(LOCAL_SCREENSHOT_PATH);
      return buffer.toString("base64"); // readFileSync already returns a Buffer
}
} catch {
console.log("Warning: Screenshot capture failed.");
}
return null;
}

// ===========================================
// Screen State Diffing
// ===========================================
interface ScreenDiff {
changed: boolean;
addedTexts: string[];
removedTexts: string[];
summary: string;
}
function diffScreenState(
prevElements: UIElement[],
currElements: UIElement[]
): ScreenDiff {
const prevTexts = new Set(prevElements.map((e) => e.text).filter(Boolean));
const currTexts = new Set(currElements.map((e) => e.text).filter(Boolean));
const addedTexts = [...currTexts].filter((t) => !prevTexts.has(t));
const removedTexts = [...prevTexts].filter((t) => !currTexts.has(t));
const prevHash = computeScreenHash(prevElements);
const currHash = computeScreenHash(currElements);
const changed = prevHash !== currHash;
let summary = "";
if (!changed) {
summary = "Screen has NOT changed since last action.";
} else {
const parts: string[] = [];
if (addedTexts.length > 0) {
parts.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
}
if (removedTexts.length > 0) {
parts.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
}
summary = parts.join(". ") || "Screen layout changed.";
}
return { changed, addedTexts, removedTexts, summary };
}
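
// Hypothetical example: if tapping "Compose" opens a draft screen, the diff
// might look like:
//   { changed: true, addedTexts: ["To", "Subject"], removedTexts: ["Inbox"],
//     summary: "New on screen: To, Subject. Gone from screen: Inbox" }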

// ===========================================
// Streaming LLM Consumer
// ===========================================
async function getDecisionStreaming(
llm: LLMProvider,
messages: ChatMessage[]
): Promise<ActionDecision> {
if (!Config.STREAMING_ENABLED || !llm.capabilities.supportsStreaming || !llm.getDecisionStream) {
return llm.getDecision(messages);
}
let accumulated = "";
process.stdout.write("Thinking");
for await (const chunk of llm.getDecisionStream(messages)) {
accumulated += chunk;
process.stdout.write(".");
}
process.stdout.write("\n");
return parseJsonResponse(accumulated);
}
/** Simple JSON parser with markdown fallback (duplicated from llm-providers for streaming path) */
function parseJsonResponse(text: string): ActionDecision {
try {
return JSON.parse(text);
} catch {
    const match = text.match(/\{[\s\S]*\}/); // greedy, so nested braces stay inside the match
if (match) {
try {
return JSON.parse(match[0]);
} catch {
// fall through
}
}
console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`);
return { action: "wait", reason: "Failed to parse response, waiting" };
}
}
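
// A well-formed decision, with fields inferred from their usage in runAgent()
// below (the authoritative type is ActionDecision in actions.ts):
//   {"action":"tap","reason":"Open Settings","think":"Wi-Fi lives in Settings",
//    "plan":["open settings","enable wifi"],"planProgress":"step 1 of 2"}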

// ===========================================
// Main Agent Loop
// ===========================================
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
const steps = maxSteps ?? Config.MAX_STEPS;
// Phase 1A: Auto-detect screen resolution
const resolution = getScreenResolution();
if (resolution) {
initDeviceContext(resolution);
console.log(`Screen resolution: ${resolution[0]}x${resolution[1]}`);
} else {
console.log("Screen resolution: using default 1080x2400 swipe coords");
}
console.log("Android Action Kernel Started");
console.log(`Goal: ${goal}`);
console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`);
console.log(`Vision: ${Config.VISION_MODE} | Streaming: ${Config.STREAMING_ENABLED}`);
console.log(`Max elements: ${Config.MAX_ELEMENTS} | History: ${Config.MAX_HISTORY_STEPS} steps`);
const llm = getLlmProvider();
// Phase 2B: Session logging
const logger = new SessionLogger(
Config.LOG_DIR,
goal,
Config.LLM_PROVIDER,
Config.getModel()
);
// Phase 4A: Multi-turn conversation memory
const messages: ChatMessage[] = [
{ role: "system", content: SYSTEM_PROMPT },
];
let prevElements: UIElement[] = [];
let stuckCount = 0;
for (let step = 0; step < steps; step++) {
console.log(`\n--- Step ${step + 1}/${steps} ---`);
// 1. Perception: Capture screen state
console.log("Scanning screen...");
const { elements, compactJson: screenContext } = getScreenState();
// 1B. Foreground app detection
const foregroundApp = getForegroundApp();
if (foregroundApp) {
console.log(`Foreground: ${foregroundApp}`);
}
// 2. Screen diff: detect stuck loops
let diffContext = "";
let screenChanged = true;
if (step > 0) {
const diff = diffScreenState(prevElements, elements);
screenChanged = diff.changed;
diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;
if (!diff.changed) {
stuckCount++;
console.log(
`Warning: Screen unchanged for ${stuckCount} step(s).`
);
if (stuckCount >= Config.STUCK_THRESHOLD) {
console.log(
`Stuck for ${stuckCount} steps. Injecting recovery hint.`
);
diffContext +=
`\nWARNING: You have been stuck for ${stuckCount} steps. ` +
`The screen is NOT changing. Try a DIFFERENT action: ` +
`swipe to scroll, press back, go home, or launch a different app.` +
`\nYour plan is not working. Create a NEW plan with a different approach.`;
}
} else {
stuckCount = 0;
}
}
prevElements = elements;
// 3. Vision: capture screenshot based on VISION_MODE
let screenshotBase64: string | null = null;
let visionContext = "";
const shouldCaptureVision =
Config.VISION_MODE === "always" ||
(Config.VISION_MODE === "fallback" && elements.length === 0);
if (shouldCaptureVision) {
screenshotBase64 = captureScreenshotBase64();
if (elements.length === 0) {
visionContext =
"\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
"A screenshot has been captured. The screen likely contains custom-drawn " +
"content (game, WebView, or Flutter). Try using coordinate-based taps on " +
"common UI positions, or use 'back'/'home' to navigate away.";
}
if (screenshotBase64 && llm.capabilities.supportsImages) {
console.log("Sending screenshot to LLM");
}
}
// 4. Build user message with all context
const foregroundLine = foregroundApp
? `FOREGROUND_APP: ${foregroundApp}\n\n`
: "";
const textContent =
`GOAL: ${goal}\n\n${foregroundLine}SCREEN_CONTEXT:\n${screenContext}${diffContext}${visionContext}`;
// Build content parts (text + optional image)
const userContent: ContentPart[] = [{ type: "text", text: textContent }];
if (screenshotBase64 && llm.capabilities.supportsImages) {
userContent.push({
type: "image",
base64: screenshotBase64,
mimeType: "image/png",
});
}
messages.push({ role: "user", content: userContent });
// Trim messages to keep within history limit
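    // (trimMessages presumably keeps the system prompt plus the most recent
    // MAX_HISTORY_STEPS exchanges; see llm-providers.ts for the actual policy.)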
const trimmed = trimMessages(messages, Config.MAX_HISTORY_STEPS);
// 5. Reasoning: Get LLM decision
const llmStart = performance.now();
let decision: ActionDecision;
try {
decision = await getDecisionStreaming(llm, trimmed);
} catch (err) {
console.log(`LLM Error: ${(err as Error).message}`);
console.log("Falling back to wait action.");
decision = { action: "wait", reason: "LLM request failed, waiting" };
}
const llmLatency = performance.now() - llmStart;
// Log thinking and planning
if (decision.think) {
console.log(`Think: ${decision.think}`);
}
if (decision.plan) {
console.log(`Plan: ${decision.plan.join(" -> ")}`);
}
if (decision.planProgress) {
console.log(`Progress: ${decision.planProgress}`);
}
    console.log(
      `Decision: ${decision.action} (${decision.reason ?? "no reason"}) [${Math.round(llmLatency)}ms]`
    );
// Append assistant response to conversation
messages.push({
role: "assistant",
content: JSON.stringify(decision),
});
// 6. Action: Execute the decision
const actionStart = performance.now();
let result: ActionResult;
try {
result = executeAction(decision);
} catch (err) {
console.log(`Action Error: ${(err as Error).message}`);
result = { success: false, message: (err as Error).message };
}
const actionLatency = performance.now() - actionStart;
// Log step
logger.logStep(
step + 1,
foregroundApp,
elements.length,
screenChanged,
decision,
result,
Math.round(llmLatency),
Math.round(actionLatency)
);
console.log(`Messages in context: ${trimmed.length}`);
// 7. Check for goal completion
if (decision.action === "done") {
console.log("\nTask completed successfully.");
logger.finalize(true);
return;
}
// Wait for UI to update
await Bun.sleep(Config.STEP_DELAY * 1000);
}
console.log("\nMax steps reached. Task may be incomplete.");
logger.finalize(false);
}

// ===========================================
// Entry Point
// ===========================================
async function main(): Promise<void> {
try {
Config.validate();
} catch (e) {
    console.error(`Configuration Error: ${(e as Error).message}`);
    process.exitCode = 1;
    return;
}
// Read user input from stdin
process.stdout.write("Enter your goal: ");
  const goal = await new Promise<string>((resolve) => {
    const reader = Bun.stdin.stream().getReader();
    // A single read() is assumed sufficient for one line of interactive input;
    // value is undefined when stdin closes with no data.
    reader.read().then(({ value }) => {
      reader.releaseLock();
      resolve(new TextDecoder().decode(value ?? new Uint8Array()).trim());
    });
  });
if (!goal) {
console.log("No goal provided. Exiting.");
return;
}
await runAgent(goal);
}
main();
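
// Example session (hypothetical transcript):
//   $ bun run src/kernel.ts
//   Enter your goal: Open Settings and enable Wi-Fi
//   Screen resolution: 1080x2400
//   Android Action Kernel Started
//   Goal: Open Settings and enable Wi-Fi
//   ...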