droidclaw/src/kernel.ts

/**
 * DroidClaw - Main Agent Loop (TypeScript/Bun Edition)
 *
 * An AI agent that controls Android devices through the accessibility API.
 * Uses LLMs to make decisions based on screen context.
 *
 * Features:
 *   - Perception -> Reasoning -> Action loop
 *   - Screen state diffing (stuck loop detection)
 *   - Error recovery with retries
 *   - Vision fallback & always-on multimodal screenshots
 *   - Dynamic early exit on goal completion
 *   - Smart element filtering (compact JSON, top-N scoring)
 *   - Multi-turn conversation memory
 *   - Multi-step planning (think/plan/planProgress)
 *   - Streaming LLM responses
 *   - Session logging with crash-safe partial writes
 *   - Auto-detect screen resolution & foreground app
 *   - 15 actions: tap, type, enter, swipe, home, back, wait, done,
 *     longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell
 *
 * Usage:
 *     bun run src/kernel.ts
 */

import { existsSync, readFileSync } from "fs";

import { Config } from "./config.js";
import {
  executeAction,
  runAdbCommand,
  getScreenResolution,
  getForegroundApp,
  initDeviceContext,
  sanitizeCoordinates,
  type ActionDecision,
  type ActionResult,
} from "./actions.js";
import { executeSkill } from "./skills.js";
import {
  getLlmProvider,
  trimMessages,
  parseJsonResponse,
  SYSTEM_PROMPT,
  type LLMProvider,
  type ChatMessage,
  type ContentPart,
} from "./llm-providers.js";
import {
  getInteractiveElements,
  computeScreenHash,
  filterElements,
  type UIElement,
} from "./sanitizer.js";
import {
  DEVICE_SCREENSHOT_PATH,
  LOCAL_SCREENSHOT_PATH,
} from "./constants.js";
import { SessionLogger } from "./logger.js";

// ===========================================
// Screen Perception
// ===========================================

interface ScreenState {
  elements: UIElement[];
  compactJson: string;
}

/**
 * Dumps the current UI XML and returns parsed elements + compact filtered JSON for the LLM.
 */
function getScreenState(): ScreenState {
  try {
    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
  } catch {
    console.log("Warning: ADB screen capture failed.");
    return { elements: [], compactJson: "Error: Could not capture screen." };
  }

  if (!existsSync(Config.LOCAL_DUMP_PATH)) {
    return { elements: [], compactJson: "Error: Could not capture screen." };
  }

  const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
  const elements = getInteractiveElements(xmlContent);
  const compact = filterElements(elements, Config.MAX_ELEMENTS);
  return { elements, compactJson: JSON.stringify(compact) };
}

/**
 * Captures a screenshot and returns the base64-encoded PNG, or null on failure.
 */
function captureScreenshotBase64(): string | null {
  try {
    runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]);
    runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]);
    if (existsSync(LOCAL_SCREENSHOT_PATH)) {
      const buffer = readFileSync(LOCAL_SCREENSHOT_PATH);
      return Buffer.from(buffer).toString("base64");
    }
  } catch {
    console.log("Warning: Screenshot capture failed.");
  }
  return null;
}

// ===========================================
// Screen State Diffing
// ===========================================

interface ScreenDiff {
  changed: boolean;
  addedTexts: string[];
  removedTexts: string[];
  summary: string;
}

function diffScreenState(
  prevElements: UIElement[],
  currElements: UIElement[]
): ScreenDiff {
  const prevTexts = new Set(prevElements.map((e) => e.text).filter(Boolean));
  const currTexts = new Set(currElements.map((e) => e.text).filter(Boolean));

  const addedTexts = [...currTexts].filter((t) => !prevTexts.has(t));
  const removedTexts = [...prevTexts].filter((t) => !currTexts.has(t));

  const prevHash = computeScreenHash(prevElements);
  const currHash = computeScreenHash(currElements);
  const changed = prevHash !== currHash;

  let summary = "";
  if (!changed) {
    summary = "Screen has NOT changed since last action.";
  } else {
    const parts: string[] = [];
    if (addedTexts.length > 0) {
      parts.push(`New on screen: ${addedTexts.slice(0, 5).join(", ")}`);
    }
    if (removedTexts.length > 0) {
      parts.push(`Gone from screen: ${removedTexts.slice(0, 5).join(", ")}`);
    }
    summary = parts.join(". ") || "Screen layout changed.";
  }

  return { changed, addedTexts, removedTexts, summary };
}

// ===========================================
// Streaming LLM Consumer
// ===========================================

async function getDecisionStreaming(
  llm: LLMProvider,
  messages: ChatMessage[]
): Promise<ActionDecision> {
  if (!Config.STREAMING_ENABLED || !llm.capabilities.supportsStreaming || !llm.getDecisionStream) {
    return llm.getDecision(messages);
  }

  let accumulated = "";
  process.stdout.write("Thinking");
  for await (const chunk of llm.getDecisionStream(messages)) {
    accumulated += chunk;
    process.stdout.write(".");
  }
  process.stdout.write("\n");

  return parseJsonResponse(accumulated);
}

// ===========================================
// Main Agent Loop
// ===========================================

export async function runAgent(goal: string, maxSteps?: number): Promise<{ success: boolean; stepsUsed: number }> {
  const steps = maxSteps ?? Config.MAX_STEPS;

  // Phase 1A: Auto-detect screen resolution
  const resolution = getScreenResolution();
  if (resolution) {
    initDeviceContext(resolution);
    console.log(`Screen resolution: ${resolution[0]}x${resolution[1]}`);
  } else {
    console.log("Screen resolution: using default 1080x2400 swipe coords");
  }

  console.log("DroidClaw Started");
  console.log(`Goal: ${goal}`);
  console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`);
  console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`);
  console.log(`Vision: ${Config.VISION_MODE} | Streaming: ${Config.STREAMING_ENABLED}`);
  console.log(`Max elements: ${Config.MAX_ELEMENTS} | History: ${Config.MAX_HISTORY_STEPS} steps`);

  const llm = getLlmProvider();

  // Phase 2B: Session logging
  const logger = new SessionLogger(
    Config.LOG_DIR,
    goal,
    Config.LLM_PROVIDER,
    Config.getModel()
  );

  // Phase 4A: Multi-turn conversation memory
  const messages: ChatMessage[] = [
    { role: "system", content: SYSTEM_PROMPT },
  ];

  let prevElements: UIElement[] = [];
  let stuckCount = 0;
  let recentActions: string[] = []; // Sliding window of action signatures for repetition detection
  let lastActionFeedback = ""; // Result of previous action to feed back to LLM

  for (let step = 0; step < steps; step++) {
    console.log(`\n--- Step ${step + 1}/${steps} ---`);

    // 1. Perception: Capture screen state
    console.log("Scanning screen...");
    const { elements, compactJson: screenContext } = getScreenState();

    // 1B. Foreground app detection
    const foregroundApp = getForegroundApp();
    if (foregroundApp) {
      console.log(`Foreground: ${foregroundApp}`);
    }

    // 2. Screen diff: detect stuck loops
    let diffContext = "";
    let screenChanged = true;
    if (step > 0) {
      const diff = diffScreenState(prevElements, elements);
      screenChanged = diff.changed;
      diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`;

      if (!diff.changed) {
        stuckCount++;
        console.log(
          `Warning: Screen unchanged for ${stuckCount} step(s).`
        );
        if (stuckCount >= Config.STUCK_THRESHOLD) {
          console.log(
            `Stuck for ${stuckCount} steps. Injecting recovery hint.`
          );

          // Context-aware recovery hints based on what actions are failing
          const failingTypes = new Set(
            recentActions.slice(-stuckCount).map((a) => a.split("(")[0])
          );

          let hint = `\nWARNING: You have been stuck for ${stuckCount} steps. The screen is NOT changing.`;

          if (failingTypes.has("tap") || failingTypes.has("longpress")) {
            hint +=
              `\nYour tap/press actions are having NO EFFECT. Likely causes:` +
              `\n- The action SUCCEEDED SILENTLY (copy/share/like buttons often work without screen changes). If so, MOVE ON to the next step.` +
              `\n- The element is not actually interactive at those coordinates.` +
              `\n- USE "clipboard_set" to set clipboard text directly instead of UI copy buttons.` +
              `\n- Or just "type" the text directly in the target app — you already have the text from SCREEN_CONTEXT.`;
          }
          if (failingTypes.has("swipe") || failingTypes.has("scroll")) {
            hint +=
              `\nSwiping is having no effect — you may be at the end of scrollable content. Try interacting with visible elements or navigate with "back"/"home".`;
          }

          hint +=
            `\nYour plan is NOT working. You MUST create a completely NEW plan with a different approach. Think about the underlying GOAL, not the specific steps that failed.`;

          diffContext += hint;
        }
      } else {
        stuckCount = 0;
      }
    }
    prevElements = elements;

    // 2B. Repetition detection (persists across screen changes — catches retry loops)
    if (recentActions.length >= 3) {
      const freq = new Map<string, number>();
      for (const a of recentActions) freq.set(a, (freq.get(a) ?? 0) + 1);
      const [topAction, topCount] = [...freq.entries()].reduce(
        (a, b) => (b[1] > a[1] ? b : a),
        ["", 0]
      );
      if (topCount >= 3) {
        diffContext +=
          `\nREPETITION_ALERT: You have attempted "${topAction}" ${topCount} times in recent steps. ` +
          `This action is clearly NOT working — do NOT attempt it again.`;
        if (topAction.includes("tap") || topAction.includes("longpress")) {
          diffContext +=
            ` ALTERNATIVES: (1) If you were copying text, the copy likely already succeeded — move on to the next step. ` +
            `(2) Use "clipboard_set" with the text from SCREEN_CONTEXT to set clipboard directly. ` +
            `(3) Use "type" to enter text directly in the target app. ` +
            `(4) Navigate away with "back" or "home" and try a different path.`;
        }
      }
    }

    // 2C. Drift detection — agent is floundering (swipe/back/wait/screenshot spam)
    if (recentActions.length >= 4) {
      const navigationActions = new Set(["swipe", "scroll", "back", "home", "wait"]);
      const navCount = recentActions
        .slice(-5)
        .filter((a) => navigationActions.has(a.split("(")[0])).length;
      if (navCount >= 4) {
        diffContext +=
          `\nDRIFT_WARNING: Your last ${navCount} actions were all navigation/waiting (swipe, back, wait, screenshot) with no direct interaction. ` +
          `You are not making progress. STOP scrolling/navigating and take a DIRECT action: ` +
          `tap a specific button from SCREEN_CONTEXT, use "type" to enter text, or use "clipboard_set". ` +
          `If you need to submit a query in a chat app, find the Send button in SCREEN_CONTEXT and tap it.`;
      }
    }

    // 3. Vision: capture screenshot based on VISION_MODE or stuck recovery
    let screenshotBase64: string | null = null;
    let visionContext = "";

    const isStuckVision = stuckCount >= 2; // Send screenshot after 2 unchanged steps
    const shouldCaptureVision =
      Config.VISION_MODE === "always" ||
      (Config.VISION_MODE === "fallback" && elements.length === 0) ||
      isStuckVision;

    if (shouldCaptureVision) {
      screenshotBase64 = captureScreenshotBase64();
      if (elements.length === 0) {
        visionContext =
          "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " +
          "A screenshot has been captured. The screen likely contains custom-drawn " +
          "content (game, WebView, or Flutter). Try using coordinate-based taps on " +
          "common UI positions, or use 'back'/'home' to navigate away.";
      } else if (isStuckVision) {
        visionContext =
          "\n\nVISION_ASSIST: You have been stuck — a screenshot is attached. " +
          "Use the screenshot to VISUALLY identify the correct field positions, " +
          "buttons, and layout. The accessibility tree may be misleading about " +
          "which field is which. Trust what you SEE in the screenshot over the " +
          "element coordinates when they conflict.";
      }
      if (screenshotBase64 && llm.capabilities.supportsImages) {
        console.log(isStuckVision ? "Stuck — sending screenshot for visual assist" : "Sending screenshot to LLM");
      }
    }

    // 4. Build user message with all context
    const foregroundLine = foregroundApp
      ? `FOREGROUND_APP: ${foregroundApp}\n\n`
      : "";
    const actionFeedbackLine = lastActionFeedback
      ? `LAST_ACTION_RESULT: ${lastActionFeedback}\n\n`
      : "";
    const textContent =
      `GOAL: ${goal}\n\n${foregroundLine}${actionFeedbackLine}SCREEN_CONTEXT:\n${screenContext}${diffContext}${visionContext}`;

    // Build content parts (text + optional image)
    const userContent: ContentPart[] = [{ type: "text", text: textContent }];
    if (screenshotBase64 && llm.capabilities.supportsImages) {
      userContent.push({
        type: "image",
        base64: screenshotBase64,
        mimeType: "image/png",
      });
    }

    messages.push({ role: "user", content: userContent });

    // Trim messages to keep within history limit
    const trimmed = trimMessages(messages, Config.MAX_HISTORY_STEPS);

    // 5. Reasoning: Get LLM decision
    const llmStart = performance.now();
    let decision: ActionDecision;
    try {
      decision = await getDecisionStreaming(llm, trimmed);
    } catch (err) {
      console.log(`LLM Error: ${(err as Error).message}`);
      console.log("Falling back to wait action.");
      decision = { action: "wait", reason: "LLM request failed, waiting" };
    }
    const llmLatency = performance.now() - llmStart;

    // Log thinking and planning
    if (decision.think) {
      console.log(`Think: ${decision.think}`);
    }
    if (decision.plan) {
      console.log(`Plan: ${decision.plan.join(" -> ")}`);
    }
    if (decision.planProgress) {
      console.log(`Progress: ${decision.planProgress}`);
    }
    console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"} (${Math.round(llmLatency)}ms)`);

    // Append assistant response to conversation
    messages.push({
      role: "assistant",
      content: JSON.stringify(decision),
    });

    // 6. Action: Execute the decision (multi-step actions or basic actions)
    const MULTI_STEP_ACTIONS = ["read_screen", "submit_message", "copy_visible_text", "wait_for_content", "find_and_tap", "compose_email"];
    const actionStart = performance.now();
    let result: ActionResult;
    try {
      if (MULTI_STEP_ACTIONS.includes(decision.action)) {
        result = executeSkill(decision, elements);
      } else {
        result = executeAction(decision);
      }
    } catch (err) {
      console.log(`Action Error: ${(err as Error).message}`);
      result = { success: false, message: (err as Error).message };
    }
    const actionLatency = performance.now() - actionStart;

    // Log step
    logger.logStep(
      step + 1,
      foregroundApp,
      elements.length,
      screenChanged,
      decision,
      result,
      Math.round(llmLatency),
      Math.round(actionLatency)
    );

    // Track action signature for repetition detection
    const actionSig = decision.coordinates
      ? `${decision.action}(${decision.coordinates.join(",")})`
      : decision.action;
    recentActions.push(actionSig);
    if (recentActions.length > 8) recentActions.shift();

    // Capture action result feedback for next iteration
    lastActionFeedback = `${actionSig} → ${result.success ? "OK" : "FAILED"}: ${result.message}`;

    console.log(`Messages in context: ${trimmed.length}`);

    // 7. Check for goal completion
    if (decision.action === "done") {
      console.log("\nTask completed successfully.");
      logger.finalize(true);
      return { success: true, stepsUsed: step + 1 };
    }

    // Wait for UI to update
    await Bun.sleep(Config.STEP_DELAY * 1000);
  }

  console.log("\nMax steps reached. Task may be incomplete.");
  logger.finalize(false);
  return { success: false, stepsUsed: steps };
}

// ===========================================
// Entry Point
// ===========================================

const BANNER = `
╔════════════════════════════════════════════════════════════════════════════╗
║                                                                            ║
║   ██████╗ ██████╗  ██████╗ ██╗██████╗  ██████╗██╗      █████╗ ██╗    ██╗   ║
║   ██╔══██╗██╔══██╗██╔═══██╗██║██╔══██╗██╔════╝██║     ██╔══██╗██║    ██║   ║
║   ██║  ██║██████╔╝██║   ██║██║██║  ██║██║     ██║     ███████║██║ █╗ ██║   ║
║   ██║  ██║██╔══██╗██║   ██║██║██║  ██║██║     ██║     ██╔══██║██║███╗██║   ║
║   ██████╔╝██║  ██║╚██████╔╝██║██████╔╝╚██████╗███████╗██║  ██║╚███╔███╔╝   ║
║   ╚═════╝ ╚═╝  ╚═╝ ╚═════╝ ╚═╝╚═════╝  ╚═════╝╚══════╝╚═╝  ╚═╝ ╚══╝╚══╝.ai   ║
║                                                                            ║
║   Android AI Agent powered by LLMs                                         ║
║                                                                            ║
╚════════════════════════════════════════════════════════════════════════════╝
`;

async function main(): Promise<void> {
  console.log(BANNER);

  try {
    Config.validate();
  } catch (e) {
    console.log(`Configuration Error: ${(e as Error).message}`);
    return;
  }

  // Check for --flow flag (deterministic YAML flows, no LLM)
  const flowIdx = process.argv.findIndex((a) => a === "--flow" || a.startsWith("--flow="));
  if (flowIdx !== -1) {
    const flowArg = process.argv[flowIdx];
    const flowFile = flowArg.includes("=")
      ? flowArg.split("=")[1]
      : process.argv[flowIdx + 1];

    if (!flowFile) {
      console.log("Error: --flow requires a YAML file path.");
      process.exit(1);
    }

    const { runFlow } = await import("./flow.js");
    try {
      const result = await runFlow(flowFile);
      console.log(`\nResult: ${result.success ? "OK" : "FAILED"} (${result.stepsCompleted}/${result.totalSteps} steps)`);
      process.exit(result.success ? 0 : 1);
    } catch (e) {
      console.log(`Flow Error: ${(e as Error).message}`);
      process.exit(1);
    }
  }

  // Check for --workflow flag
  const workflowIdx = process.argv.findIndex((a) => a === "--workflow" || a.startsWith("--workflow="));
  if (workflowIdx !== -1) {
    const arg = process.argv[workflowIdx];
    const workflowFile = arg.includes("=")
      ? arg.split("=")[1]
      : process.argv[workflowIdx + 1];

    if (!workflowFile) {
      console.log("Error: --workflow requires a JSON file path.");
      process.exit(1);
    }

    const { runWorkflow } = await import("./workflow.js");
    const workflow = JSON.parse(await Bun.file(workflowFile).text());
    const result = await runWorkflow(workflow);

    console.log(`\n=== Workflow "${result.name}" ===`);
    for (const step of result.steps) {
      const status = step.success ? "OK" : "FAILED";
      console.log(`  [${status}] ${step.goal} (${step.stepsUsed} steps)${step.error ? ` — ${step.error}` : ""}`);
    }
    console.log(`\nResult: ${result.success ? "All steps completed" : "Some steps failed"}`);
    process.exit(result.success ? 0 : 1);
  }

  // Interactive mode: read goal from stdin
  process.stdout.write("Enter your goal: ");
  const goal = await new Promise<string>((resolve) => {
    const reader = Bun.stdin.stream().getReader();
    reader.read().then(({ value }) => {
      resolve(new TextDecoder().decode(value).trim());
      reader.releaseLock();
    });
  });

  if (!goal) {
    console.log("No goal provided. Exiting.");
    return;
  }

  await runAgent(goal);
}

main();