From 610fd048189b44060f4fe30d5f19f47449848a40 Mon Sep 17 00:00:00 2001 From: Sanju Sivalingam Date: Fri, 6 Feb 2026 10:32:58 +0530 Subject: [PATCH] 10x improvement: vision, multi-turn memory, planning, streaming, smart filtering, logging - Auto-detect screen resolution and compute dynamic swipe coordinates - Detect foreground app each step via dumpsys activity - Smart element filtering: deduplicate by position, score by relevance, compact to essentials - Session logging with crash-safe .partial.json writes and final summary - Real multimodal vision: send base64 screenshots to LLMs (off/fallback/always modes) - Multi-turn conversation memory: maintain full chat history across steps with trimming - Multi-step planning: think/plan/planProgress fields on every LLM decision - Streaming responses for all 4 providers (OpenAI, Groq, OpenRouter, Bedrock) - Comprehensive README with examples, architecture docs, and troubleshooting Co-Authored-By: Claude Opus 4.6 --- android-action-kernel/.env.example | 27 +- android-action-kernel/README.md | 734 +++++++++++++++++++-- android-action-kernel/src/actions.ts | 70 +- android-action-kernel/src/config.ts | 23 +- android-action-kernel/src/constants.ts | 40 ++ android-action-kernel/src/kernel.ts | 244 +++++-- android-action-kernel/src/llm-providers.ts | 382 ++++++++--- android-action-kernel/src/logger.ts | 129 ++++ android-action-kernel/src/sanitizer.ts | 78 +++ 9 files changed, 1512 insertions(+), 215 deletions(-) create mode 100644 android-action-kernel/src/logger.ts diff --git a/android-action-kernel/.env.example b/android-action-kernel/.env.example index 19990e2..c9a6220 100644 --- a/android-action-kernel/.env.example +++ b/android-action-kernel/.env.example @@ -11,9 +11,32 @@ MAX_RETRIES=3 # Retries on ADB/network failures STUCK_THRESHOLD=3 # Steps before stuck-loop recovery kicks in # =========================================== -# Vision Fallback (when accessibility tree is empty) +# Vision Mode # 
=========================================== -VISION_ENABLED=true # Auto-capture screenshot when UI elements not found +# "off" — never capture screenshots +# "fallback" — only when accessibility tree is empty (default) +# "always" — send screenshot every step (uses more tokens, best accuracy) +VISION_MODE=fallback + +# =========================================== +# Smart Element Filtering +# =========================================== +MAX_ELEMENTS=40 # Max UI elements sent to LLM (scored & ranked) + +# =========================================== +# Session Logging +# =========================================== +LOG_DIR=logs # Directory for session JSON logs + +# =========================================== +# Multi-turn Memory +# =========================================== +MAX_HISTORY_STEPS=10 # How many past steps to keep in conversation context + +# =========================================== +# Streaming Responses +# =========================================== +STREAMING_ENABLED=true # Stream LLM responses (shows progress dots) # =========================================== # LLM Provider: "groq", "openai", "bedrock", or "openrouter" diff --git a/android-action-kernel/README.md b/android-action-kernel/README.md index 83dacd4..a49a68e 100644 --- a/android-action-kernel/README.md +++ b/android-action-kernel/README.md @@ -1,82 +1,698 @@ # Android Action Kernel -AI agent that controls Android devices through the Accessibility API. Give it a goal in plain English and it autonomously navigates the device using a Perception → Reasoning → Action loop. +An AI agent that takes control of your Android phone. You give it a goal in plain English — "order me a coffee on Swiggy" or "turn on dark mode" — and it figures out what to tap, type, and swipe to get it done. 
-## How It Works +It works by reading what's on the screen (the accessibility tree, and optionally a screenshot), sending that to an LLM, getting back a JSON action like `{"action": "tap", "coordinates": [540, 1200]}`, and executing it via ADB. Then it reads the screen again, and repeats. Perception, reasoning, action — in a loop, until the goal is achieved or it runs out of steps. -1. **Perceive** — Captures the screen's accessibility tree via `adb shell uiautomator dump`, parses it into interactive UI elements with coordinates and state -2. **Reason** — Sends the screen context, action history, and goal to an LLM which decides the next action as a JSON object -3. **Act** — Executes the action (tap, type, swipe, launch app, etc.) via ADB -4. **Repeat** — Diffs the screen state, detects stuck loops, and continues until the goal is done or max steps reached +Think of it as an autopilot for your Android device, powered by whatever LLM you want. -Falls back to screenshot-based vision when the accessibility tree is empty (games, WebViews, Flutter). +## What it looks like in action -## Prerequisites +``` +$ bun run src/kernel.ts +Enter your goal: Open YouTube and search for "lofi hip hop" -- [Bun](https://bun.sh) 1.0+ -- [Android SDK Platform Tools](https://developer.android.com/tools/releases/platform-tools) (ADB in PATH) -- Android device connected via USB or WiFi ADB -- API key for one of: Groq, OpenAI, AWS Bedrock, or OpenRouter +Screen resolution: 1080x2400 +Android Action Kernel Started +Goal: Open YouTube and search for "lofi hip hop" +Provider: groq (llama-3.3-70b-versatile) +Max steps: 30 | Step delay: 2s +Vision: fallback | Streaming: true +Max elements: 40 | History: 10 steps -## Quick Start +--- Step 1/30 --- +Scanning screen... +Foreground: com.google.android.launcher3/.uioverrides.QuickstepLauncher +Thinking....... +Think: I'm on the home screen. I should launch YouTube directly rather than looking for it. 
+Plan: Launch YouTube -> Tap search -> Type query -> Press enter -> Done +Progress: Step 1: launching YouTube +Decision: launch — Open YouTube app directly (842ms) +Launching: com.google.android.youtube +Messages in context: 3 + +--- Step 2/30 --- +Scanning screen... +Foreground: com.google.android.youtube/.HomeActivity +Thinking..... +Think: YouTube is open, showing the home feed. I need to tap the search icon. +Progress: Step 2: tapping search +Decision: tap — Tap the search icon at top right (623ms) +Tapping: (978, 142) +Messages in context: 5 + +--- Step 3/30 --- +Scanning screen... +Foreground: com.google.android.youtube/.SearchActivity +Thinking.... +Think: Search field is focused and ready for input. +Progress: Step 3: typing query +Decision: type — Type the search query (501ms) +Typing: lofi hip hop +Messages in context: 7 + +--- Step 4/30 --- +Scanning screen... +Thinking... +Decision: enter — Submit the search (389ms) +Pressing Enter +Messages in context: 9 + +--- Step 5/30 --- +Scanning screen... +Thinking.... +Think: Search results are showing lofi hip hop videos. Goal achieved. +Decision: done — Search results for "lofi hip hop" are displayed (412ms) + +Task completed successfully. +Session log saved: logs/1706234567890-a3f2k1.json +``` + +## Quick start + +You need three things: Bun (the JavaScript runtime), ADB (Android Debug Bridge), and an API key for at least one LLM provider. + +### 1. Install prerequisites + +**Bun** (if you don't have it): +```bash +curl -fsSL https://bun.sh/install | bash +``` + +**ADB** — comes with Android SDK Platform Tools: +```bash +# macOS +brew install android-platform-tools + +# Ubuntu/Debian +sudo apt install android-tools-adb + +# Or download directly from https://developer.android.com/tools/releases/platform-tools +``` + +### 2. 
Connect your Android device + +Plug in your phone via USB, or connect over WiFi: + +```bash +# USB — just plug it in, then verify: +adb devices +# Should show your device ID + +# WiFi (after initial USB connection): +adb tcpip 5555 +adb connect 192.168.1.42:5555 +``` + +You'll need to enable **USB Debugging** on your phone: +- Go to Settings > About Phone > tap "Build Number" 7 times (unlocks Developer Options) +- Go to Settings > Developer Options > enable "USB Debugging" +- When you connect, tap "Allow" on the USB debugging prompt + +### 3. Install dependencies and configure ```bash cd android-action-kernel bun install cp .env.example .env -# Edit .env — set LLM_PROVIDER and the corresponding API key +``` + +Edit `.env` and add your API key. The fastest way to get started is with Groq (free tier): + +```bash +# .env +LLM_PROVIDER=groq +GROQ_API_KEY=gsk_your_key_here +``` + +### 4. Run it + +```bash bun run src/kernel.ts ``` -The agent will prompt you for a goal, then start controlling the device. +It'll prompt you for a goal. Type what you want the phone to do, and watch it go. -## Configuration +## Goals you can try -Copy `.env.example` to `.env`. 
Key settings: - -| Variable | Default | Description | -|---|---|---| -| `LLM_PROVIDER` | `groq` | `groq`, `openai`, `bedrock`, or `openrouter` | -| `MAX_STEPS` | `30` | Maximum actions before stopping | -| `STEP_DELAY` | `2` | Seconds between actions (lets UI settle) | -| `STUCK_THRESHOLD` | `3` | Unchanged screens before recovery kicks in | -| `VISION_ENABLED` | `true` | Screenshot fallback when accessibility tree is empty | - -### LLM Providers - -| Provider | Key Variable | Default Model | -|---|---|---| -| Groq (free tier) | `GROQ_API_KEY` | `llama-3.3-70b-versatile` | -| OpenAI | `OPENAI_API_KEY` | `gpt-4o` | -| AWS Bedrock | AWS credential chain | `us.meta.llama3-3-70b-instruct-v1:0` | -| OpenRouter | `OPENROUTER_API_KEY` | `anthropic/claude-3.5-sonnet` | - -## Available Actions - -The agent can perform 15 actions: - -| Category | Actions | -|---|---| -| Navigation | `tap`, `longpress`, `swipe`, `enter`, `back`, `home` | -| Text | `type`, `clear` | -| App Control | `launch` (by package, activity, or URI with extras) | -| Data | `screenshot`, `clipboard_get`, `clipboard_set` | -| System | `shell`, `wait`, `done` | - -## Project Structure +Here are some real tasks the kernel can handle — from simple to complex: ``` -src/ - kernel.ts # Main agent loop (entry point) - actions.ts # ADB action implementations with retry - llm-providers.ts # LLM abstraction (OpenAI, Groq, Bedrock, OpenRouter) - sanitizer.ts # Accessibility XML parser - config.ts # Environment config loader - constants.ts # ADB keycodes, coordinates, defaults +# Simple navigation +Open Settings +Go to the home screen and open the calculator + +# App-specific tasks +Open WhatsApp and send "I'm running late" to Mom +Open Chrome and search for "best restaurants near me" +Open YouTube and play the first trending video + +# System settings +Turn on WiFi +Set display brightness to maximum +Enable dark mode + +# Multi-app workflows +Take a screenshot of the weather app and share it on WhatsApp +Open Google 
Maps, search for "coffee", and navigate to the nearest one +Copy the tracking number from the Amazon order page and search it on Google + +# Complex tasks +Order a medium pepperoni pizza from Dominos +Book an Uber from home to the airport +Check my Gmail for any emails from Amazon and read the latest one ``` -## Notes +The kernel knows 15 different actions and uses them in combination. For a multi-step task like "order a pizza", it might: launch the app, tap the search bar, type "pepperoni", tap a result, scroll down to size options, tap "medium", add to cart — each step reasoning about what's on screen right now. -- Swipe coordinates in `constants.ts` are calibrated for 1080px-wide screens. Adjust `SWIPE_COORDS` for different resolutions. -- The agent automatically detects stuck loops and injects recovery hints after `STUCK_THRESHOLD` steps without screen changes. -- ADB commands retry with exponential backoff (up to `MAX_RETRIES` attempts). +## Choosing an LLM provider + +The kernel works with four providers. You only need one. + +| Provider | Best for | Vision? | Cost | +|---|---|---|---| +| **Groq** | Getting started, fast iteration | No | Free tier available | +| **OpenAI** | Best accuracy with GPT-4o, full vision | Yes | Pay per token | +| **OpenRouter** | Access to 200+ models (Claude, Gemini, etc.) | Yes | Pay per token | +| **AWS Bedrock** | Enterprise, Claude on AWS | Yes (Anthropic models) | Pay per token | + +### Groq (recommended to start) + +Groq is the fastest and has a generous free tier. It doesn't support vision (screenshots), but the accessibility tree is usually enough. + +```bash +LLM_PROVIDER=groq +GROQ_API_KEY=gsk_your_key_here +GROQ_MODEL=llama-3.3-70b-versatile +# Faster but less capable: llama-3.1-8b-instant +``` + +Get your key at [console.groq.com](https://console.groq.com). + +### OpenAI + +GPT-4o gives the best results, especially with vision enabled. More expensive but more reliable on complex tasks. 
+ +```bash +LLM_PROVIDER=openai +OPENAI_API_KEY=sk-your_key_here +OPENAI_MODEL=gpt-4o +# Cheaper alternative: gpt-4o-mini +``` + +### OpenRouter + +One API key, access to hundreds of models. Great if you want to try Claude, Gemini, or open-source models. + +```bash +LLM_PROVIDER=openrouter +OPENROUTER_API_KEY=sk-or-v1-your_key_here +OPENROUTER_MODEL=anthropic/claude-3.5-sonnet +# Other popular options: +# google/gemini-2.0-flash-001 (fast + cheap) +# openai/gpt-4o (multimodal) +# deepseek/deepseek-chat (cost efficient) +``` + +Get your key at [openrouter.ai/keys](https://openrouter.ai/keys). + +### AWS Bedrock + +For enterprise setups. Uses your AWS credential chain (`aws configure`), no API key needed in `.env`. + +```bash +LLM_PROVIDER=bedrock +AWS_REGION=us-east-1 +BEDROCK_MODEL=anthropic.claude-3-sonnet-20240229-v1:0 +# Or: us.meta.llama3-3-70b-instruct-v1:0 +``` + +## Configuration deep dive + +All settings live in `.env`. Here's what each one does and when you'd change it. + +### Agent behavior + +```bash +MAX_STEPS=30 # How many perception-action cycles before giving up. + # Simple tasks finish in 3-5 steps. Complex multi-app + # workflows might need 20+. Default is 30. + +STEP_DELAY=2 # Seconds to wait after each action before reading the + # screen again. This gives the UI time to settle. Too low + # and you'll read a screen mid-animation. Default is 2. + +STUCK_THRESHOLD=3 # If the screen hasn't changed for this many steps, the + # kernel injects a "you're stuck, try something else" + # hint into the LLM prompt. Default is 3. + +MAX_RETRIES=3 # Retries on ADB failures (device disconnected, etc.) + # with exponential backoff. Default is 3. +``` + +### Vision + +```bash +VISION_MODE=fallback # Controls when screenshots are sent to the LLM. + # + # "off" — Never send screenshots. Cheapest option. + # Works fine for most native Android apps + # where the accessibility tree has all + # the info the LLM needs. 
+ # + # "fallback" — Only capture a screenshot when the + # accessibility tree returns zero elements. + # This happens with games, WebViews, Flutter + # apps, and custom-drawn UI. This is the + # default and a good middle ground. + # + # "always" — Send a screenshot every single step. Most + # accurate but uses significantly more tokens. + # Use this with GPT-4o or Claude when you + # need the LLM to actually *see* the screen + # (visual layouts, icons without text labels, + # image recognition, etc.) +``` + +### Smart filtering + +```bash +MAX_ELEMENTS=40 # A typical Android screen has 50-200 UI elements. Most + # of them are decorative (dividers, spacers, background + # containers). This setting caps how many elements the + # LLM sees after scoring and deduplication. Default is 40. + # + # Lower = faster + cheaper but may miss relevant buttons. + # Higher = more context but slower + more expensive. +``` + +### Conversation memory + +```bash +MAX_HISTORY_STEPS=10 # The LLM sees the full conversation history — its + # previous observations and decisions. This setting caps + # how many past steps to keep (oldest get trimmed). + # Default is 10. Higher values give the LLM more context + # about what it already tried, but use more tokens. +``` + +### Streaming + +```bash +STREAMING_ENABLED=true # When true, LLM responses stream in token-by-token, + # showing progress dots (".....") in the terminal so + # you know it's working. When false, the full response + # comes back all at once after a longer pause. +``` + +### Logging + +```bash +LOG_DIR=logs # Where session logs are written. Each run produces a + # JSON file with every step: what was on screen, what + # the LLM decided, how long it took, success/failure. + # Great for debugging and replays. +``` + +## How it works under the hood + +The kernel runs a loop. Each iteration has six phases: + +``` + +------------------+ + | 1. 
PERCEIVE | Dump the accessibility tree via uiautomator + | the screen | Parse XML into UI elements, detect foreground app + +--------+---------+ + | + +--------v---------+ + | 2. FILTER | Score elements (editable +8, focused +6, clickable +5...) + | & COMPACT | Deduplicate by position, keep top 40, strip to essentials + +--------+---------+ + | + +--------v---------+ + | 3. CAPTURE | If VISION_MODE allows: take a screenshot, + | screenshot | encode as base64 for the LLM + +--------+---------+ + | + +--------v---------+ + | 4. REASON | Send the full conversation (system prompt + all past + | via LLM | observations + current screen + screenshot) to the LLM + +--------+---------+ + | + +--------v---------+ + | 5. EXECUTE | Parse the LLM's JSON response, run the ADB command + | action | (tap, type, swipe, launch, etc.) + +--------+---------+ + | + +--------v---------+ + | 6. LOG & DIFF | Write step to session log, diff screen state, + | | detect stuck loops, wait for UI to settle + +--------+---------+ + | + +---------> back to step 1 +``` + +### The 15 actions + +The LLM responds with JSON. 
Here's every action it can choose: + +**Navigation** — moving around the UI +```json +{"action": "tap", "coordinates": [540, 1200], "reason": "Tap the Sign In button"} +{"action": "longpress", "coordinates": [540, 1200], "reason": "Long press to open context menu"} +{"action": "swipe", "direction": "up", "reason": "Scroll down to see more items"} +{"action": "enter", "reason": "Submit the search query"} +{"action": "back", "reason": "Go back to previous screen"} +{"action": "home", "reason": "Return to home screen"} +``` + +**Text input** — typing and clearing +```json +{"action": "clear", "reason": "Clear old text before typing new query"} +{"action": "type", "text": "best pizza near me", "reason": "Type the search query"} +``` + +**App control** — launching apps and intents +```json +{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"} +{"action": "launch", "uri": "https://maps.google.com/?q=pizza+near+me", "reason": "Open Maps with search"} +{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", + "extras": {"android.intent.extra.TEXT": "Check this out"}, "reason": "Share image to WhatsApp"} +``` + +**Data** — screenshots and clipboard +```json +{"action": "screenshot", "reason": "Capture current screen state"} +{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof of purchase"} +{"action": "clipboard_get", "reason": "Read what's in clipboard"} +{"action": "clipboard_set", "text": "Hello", "reason": "Copy text to clipboard"} +``` + +**System** — shell commands, waiting, and completion +```json +{"action": "shell", "command": "am force-stop com.broken.app", "reason": "Kill unresponsive app"} +{"action": "wait", "reason": "Wait for page to finish loading"} +{"action": "done", "reason": "Goal is complete — search results are showing"} +``` + +### Planning and thinking + +The LLM doesn't just pick actions blindly. 
Before every action, it reasons about what it sees and where it is in its plan: + +```json +{ + "think": "I see the YouTube search results page. The first video is 'lofi hip hop radio - beats to relax/study to' which matches what the user wants. I should tap on it to start playing.", + "plan": [ + "Launch YouTube", + "Search for lofi hip hop", + "Play the first result", + "Verify it's playing" + ], + "planProgress": "Step 3 of 4: playing the first result", + "action": "tap", + "coordinates": [540, 450], + "reason": "Tap the first search result to play it" +} +``` + +The `think` field shows the LLM's reasoning — what it sees on screen and why it's making this choice. The `plan` is a high-level roadmap it creates at the start and updates as it goes. `planProgress` tracks where it is in that roadmap. + +When the agent gets stuck (screen not changing for several steps), the kernel tells the LLM: "Your plan isn't working. Create a new one with a different approach." This forces it to re-evaluate rather than stubbornly retrying the same thing. + +### Smart element filtering + +A raw Android accessibility dump can have 150+ elements. Most are useless — background containers, invisible spacers, decorative views. Sending all of them wastes tokens and confuses the LLM. + +The kernel filters elements in three steps: + +**1. Deduplication** — Elements at the same coordinates (within 5px) are collapsed. If two elements overlap at the same spot, the one with the higher relevance score wins. + +**2. 
Scoring** — Each element gets a score based on how useful it is: + +| Property | Score | Why | +|---|---|---| +| Enabled | +10 | Disabled elements can't be interacted with | +| Editable (text field) | +8 | You almost always need to interact with input fields | +| Currently focused | +6 | Focused elements are what the user is likely targeting | +| Clickable / long-clickable | +5 | Buttons and links are primary interaction targets | +| Has visible text | +3 | Elements with text are more informative | + +**3. Compaction** — The top N elements (default 40) are stripped to just the fields the LLM needs. A "Sign In" button goes from this: + +```json +{ + "id": "com.app:id/login_btn", + "text": "Sign In", + "type": "Button", + "bounds": "[360,1740][720,1860]", + "center": [540, 1800], + "size": [360, 120], + "clickable": true, + "editable": false, + "enabled": true, + "checked": false, + "focused": false, + "selected": false, + "scrollable": false, + "longClickable": false, + "password": false, + "hint": "", + "action": "tap", + "parent": "LoginForm", + "depth": 4 +} +``` + +To this: + +```json +{"text": "Sign In", "center": [540, 1800], "action": "tap"} +``` + +The LLM gets exactly what it needs — the text label, where to tap, and what kind of interaction is expected. Default flags (enabled=true, checked=false, etc.) are omitted since they add no information. Non-default flags are included, so a disabled button would show `"enabled": false` as a clear signal not to tap it. + +### Multi-turn conversation memory + +This is one of the most important features. Unlike a stateless setup where every LLM call starts fresh, the kernel maintains a full conversation across all steps. Each step adds two messages: + +- A **user message** with the current screen state (filtered elements + screenshot + foreground app + screen diff) +- An **assistant message** with the LLM's JSON decision + +So by step 5, the LLM has the full history of what it saw and did in steps 1 through 4. 
It remembers that it already typed "lofi hip hop" and won't type it again. It knows it already tried tapping a certain button and it didn't work. It can reference earlier observations to make better decisions. + +To keep context from growing forever, the kernel trims it to the last N steps (default 10), always keeping the system prompt. Older steps get replaced with a brief `[5 earlier steps omitted]` marker. + +Here's what the conversation looks like internally by step 3: + +``` +[system] You are an Android Driver Agent... +[user] GOAL: Open YouTube... FOREGROUND_APP: launcher... SCREEN_CONTEXT: [...] +[assistant] {"action": "launch", "package": "com.google.android.youtube", ...} +[user] GOAL: Open YouTube... FOREGROUND_APP: youtube/HomeActivity... SCREEN_CONTEXT: [...] +[assistant] {"action": "tap", "coordinates": [978, 142], ...} +[user] GOAL: Open YouTube... FOREGROUND_APP: youtube/SearchActivity... SCREEN_CONTEXT: [...] +``` + +The LLM sees this entire chain and can reason about it: "I already launched YouTube (step 1) and tapped search (step 2). Now I see the search field is focused, so I should type the query." + +### Screen resolution detection + +The kernel auto-detects your device's screen resolution at startup by running `adb shell wm size`. It checks for an override resolution first (set by apps or developer settings), then falls back to the physical resolution. + +Swipe coordinates are calculated as proportional ratios of the screen dimensions. 
The reference device is 1080x2400, and all coordinates scale proportionally: + +- **Scroll up**: swipe from 62.5% to 20.8% of screen height (center X) +- **Scroll down**: swipe from 20.8% to 62.5% of screen height (center X) +- **Swipe left**: swipe from 74.1% to 18.5% of screen width (center Y) +- **Swipe right**: swipe from 18.5% to 74.1% of screen width (center Y) + +This means swiping works correctly whether you have a 1080x2400 phone, a 1440x3200 tablet, a 720x1280 budget phone, or a custom emulator resolution. If resolution detection fails, it falls back to the hardcoded 1080x2400 coordinates. + +### Session logging + +Every run produces a JSON log in the `logs/` directory. The log captures everything: + +- **Session metadata** — goal, LLM provider, model, start/end timestamps +- **Per-step data** — foreground app, element count, screen change status, full LLM decision (including think/plan), action result, LLM latency, action latency +- **Summary stats** — total steps, success/failure counts, whether the task completed + +A `.partial.json` file is written after every single step, so even if the process crashes mid-run, you don't lose the data. At the end of the run, the final `.json` summary is written and logged to console. + +Here's what one step looks like in the log: + +```json +{ + "step": 3, + "timestamp": "2024-01-25T14:30:22.456Z", + "foregroundApp": "com.google.android.youtube/.SearchActivity", + "elementCount": 47, + "screenChanged": true, + "llmDecision": { + "action": "type", + "text": "lofi hip hop", + "reason": "Type search query", + "think": "Search field is focused and empty. 
Time to type the query.", + "plan": ["Launch YouTube", "Tap search", "Type query", "Submit", "Done"], + "planProgress": "Step 3: typing the search query" + }, + "actionResult": { + "success": true, + "message": "Typed \"lofi hip hop\"" + }, + "llmLatencyMs": 623, + "actionLatencyMs": 145 +} +``` + +These logs are useful for: +- **Debugging** — why did the agent tap the wrong button on step 7? +- **Performance tuning** — which steps are slow? Is the LLM taking too long? +- **Replays** — trace through the exact sequence of observations and decisions +- **Comparing models** — run the same goal with different LLMs and compare their logs + +### Streaming + +When streaming is enabled (the default), LLM responses arrive token-by-token instead of all at once. In the terminal, you'll see progress dots while the LLM thinks: + +``` +Thinking........... +``` + +Each dot is a chunk of the response arriving. This matters because some LLM calls take 2-5 seconds, and without streaming you'd just see a frozen terminal with no feedback. Under the hood: + +- **OpenAI / Groq**: Uses `stream: true` on the chat completions API +- **OpenRouter**: Uses Vercel AI SDK's `streamText()` which returns a `.textStream` async iterable +- **Bedrock (Anthropic)**: Uses `InvokeModelWithResponseStreamCommand` for native streaming +- **Bedrock (Meta/other)**: Falls back to non-streaming (these models don't support it through Bedrock's streaming API) + +If you set `STREAMING_ENABLED=false`, every provider falls back to the standard request-response pattern. 
+ +## Architecture + +Seven source files, no subdirectories, no frameworks beyond the LLM SDKs: + +``` +android-action-kernel/src/ + kernel.ts Main agent loop — ties everything together + actions.ts 15 action implementations + device detection + ADB retry logic + llm-providers.ts LLM abstraction (4 providers) + system prompt + message types + sanitizer.ts Accessibility XML parser + smart filtering + scoring + config.ts Reads .env into a typed Config object + constants.ts ADB keycodes, coordinate ratios, defaults, magic values + logger.ts Session logging with crash-safe partial writes +``` + +### Data flow in one step + +``` +kernel.ts actions.ts sanitizer.ts + | | | + |-- runAdbCommand() -------->| | + | "uiautomator dump" | | + |<-- XML file pulled --------| | + | | | + |-- getInteractiveElements() --------------------------->| + | (parse raw XML) | | + |<-- UIElement[] ----------------------------------------| + | | | + |-- filterElements() ----------------------------------->| + | (score, dedup, compact) | | + |<-- CompactUIElement[] ---------------------------------| + | | | + |-- getForegroundApp() ----->| | + |<-- "com.app/.Activity" ---| | + | | | + | llm-providers.ts | + | | | + |-- getDecisionStream() --->| | + | (messages[]) |-- (calls OpenAI/etc.) -->| + |<-- ActionDecision --------| | + | | | + |-- executeAction() ------->| | + | (tap, type, swipe...) |-- runAdbCommand() ------>| + |<-- ActionResult ----------| | + | | | + | logger.ts | + | | | + |-- logStep() ------------->| | + | (writes .partial.json) | | +``` + +### Extending the kernel + +**Adding a new LLM provider:** + +1. Implement the `LLMProvider` interface in `llm-providers.ts`: +```typescript +export interface LLMProvider { + readonly capabilities: { + supportsImages: boolean; // Can this provider handle base64 screenshots? + supportsStreaming: boolean; // Does it support token-by-token streaming? 

+  };
+  getDecision(messages: ChatMessage[]): Promise<ActionDecision>;
+  getDecisionStream?(messages: ChatMessage[]): AsyncIterable<string>;
+}
+```
+
+2. Add a case to the `getLlmProvider()` factory
+3. Add config fields to `config.ts` and env vars to `.env.example`
+
+**Adding a new action:**
+
+1. Add any new fields to `ActionDecision` in `actions.ts`
+2. Write an `executeNewAction()` function
+3. Add the case to the `executeAction()` switch
+4. Document the JSON format in the `SYSTEM_PROMPT` in `llm-providers.ts` — this is how the LLM learns the action exists
+
+## Commands
+
+```bash
+bun install # Install dependencies (run this first)
+bun run src/kernel.ts # Start the agent (prompts for a goal)
+bun run build # Compile to dist/ (bun build --target bun)
+bun run typecheck # Type-check with zero errors (tsc --noEmit)
+```
+
+## Troubleshooting
+
+**"adb: command not found"**
+ADB isn't in your PATH. Either install it via your package manager (see Quick Start) or set `ADB_PATH=/full/path/to/adb` in `.env`.
+
+**"no devices/emulators found"**
+Your phone isn't connected or USB debugging isn't enabled. Run `adb devices` — you should see a device ID, not an empty list. Check that you tapped "Allow" on the USB debugging prompt on your phone.
+
+**"Warning: ADB screen capture failed"**
+Sometimes `uiautomator dump` fails transiently — the screen is in transition, an animation is playing, or the device is briefly unresponsive. The kernel retries automatically with exponential backoff. If it keeps failing, try increasing `STEP_DELAY` to give the UI more time to settle between steps.
+
+**"Warning: Could not detect screen resolution"**
+The kernel falls back to default 1080x2400 coordinates. Swipes might not scroll correctly on devices with very different resolutions. You can check your device manually with `adb shell wm size`.
+
+**The agent keeps doing the same thing over and over**
+This is the "stuck loop" problem. 
The kernel detects it automatically after `STUCK_THRESHOLD` steps (default 3) and tells the LLM to try a completely different approach. If it's still stuck after that, the task might need a more capable model. Try GPT-4o or Claude via OpenRouter — they're significantly better at complex multi-step reasoning than smaller models. + +**"Could not parse LLM response"** +The LLM sometimes returns malformed JSON, especially cheaper/smaller models. The kernel has fallback parsing — it tries to extract JSON from markdown code blocks and mixed text. If parsing fails completely, it falls back to a "wait" action and tries again next step. If this happens frequently, switch to a larger model. OpenAI and Groq have `response_format: json_object` enabled which almost eliminates this problem. + +**Vision not working with Groq** +Groq doesn't support image inputs. The kernel handles this gracefully — it sends a `[Screenshot attached]` text placeholder instead of the actual image. If you need the LLM to actually see the screen (for games, WebViews, apps with no accessibility labels), use OpenAI, OpenRouter, or Bedrock with an Anthropic model. + +**High token usage / expensive runs** +A few things to try: +- Set `VISION_MODE=off` if you don't need screenshots (biggest token saver) +- Lower `MAX_ELEMENTS` from 40 to 20 or 25 +- Lower `MAX_HISTORY_STEPS` from 10 to 5 +- Use a cheaper model (gpt-4o-mini, llama-3.1-8b-instant, deepseek-chat) + +## How is this different from... + +**Appium / UIAutomator2 test frameworks** — Those require you to write explicit test scripts with selectors, waits, and assertions. This kernel is goal-driven: you say *what* you want, and the LLM figures out *how*. No selectors, no XPaths, no test scripts. The tradeoff is that it's non-deterministic — the LLM might take a slightly different path each time. + +**Phone mirroring tools (scrcpy, Vysor)** — Those let *you* control the phone remotely with your own hands. This lets an *AI* control it autonomously. 
Different use case entirely. + +**Android accessibility services** — Those run *on the phone* as installed apps. This runs on *your computer* and talks to the device over ADB. No app installation required on the phone — just USB debugging enabled. + +**Cloud device farms (BrowserStack, Firebase Test Lab)** — Those are designed for automated testing at scale. This is designed for single-device autonomous task completion. You could potentially use this kernel with a cloud device, but that's not the primary use case. + +## License + +MIT diff --git a/android-action-kernel/src/actions.ts b/android-action-kernel/src/actions.ts index 14d1839..8723704 100644 --- a/android-action-kernel/src/actions.ts +++ b/android-action-kernel/src/actions.ts @@ -20,6 +20,7 @@ import { LONG_PRESS_DURATION_MS, DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH, + computeSwipeCoords, } from "./constants.js"; export interface ActionDecision { @@ -37,6 +38,10 @@ export interface ActionDecision { command?: string; // screenshot action filename?: string; + // planning fields (Phase 4B) + think?: string; + plan?: string[]; + planProgress?: string; } export interface ActionResult { @@ -75,6 +80,68 @@ export function runAdbCommand(command: string[], retries = Config.MAX_RETRIES): return ""; } +// =========================================== +// Device Intelligence (Phase 1) +// =========================================== + +/** Module-level dynamic swipe coords, set by initDeviceContext() */ +let dynamicSwipeCoords: Record | null = null; + +/** + * Detects the connected device's screen resolution via ADB. + * Returns [width, height] or null on failure. 
+ */ +export function getScreenResolution(): [number, number] | null { + try { + const output = runAdbCommand(["shell", "wm", "size"]); + // Try "Override size:" first, then "Physical size:" + const overrideMatch = output.match(/Override size:\s*(\d+)x(\d+)/); + if (overrideMatch) { + return [parseInt(overrideMatch[1], 10), parseInt(overrideMatch[2], 10)]; + } + const physicalMatch = output.match(/Physical size:\s*(\d+)x(\d+)/); + if (physicalMatch) { + return [parseInt(physicalMatch[1], 10), parseInt(physicalMatch[2], 10)]; + } + } catch { + console.log("Warning: Could not detect screen resolution."); + } + return null; +} + +/** + * Detects the currently running foreground app. + * Returns "package/activity" or null on failure. + */ +export function getForegroundApp(): string | null { + try { + const output = runAdbCommand([ + "shell", "dumpsys", "activity", "activities", + ]); + // Match mResumedActivity line + const match = output.match(/mResumedActivity.*?(\S+\/\S+)/); + if (match) { + return match[1].replace("}", ""); + } + } catch { + // ignore + } + return null; +} + +/** + * Stores dynamic swipe coordinates based on detected resolution. + * Must be called once at startup. + */ +export function initDeviceContext(resolution: [number, number]): void { + dynamicSwipeCoords = computeSwipeCoords(resolution[0], resolution[1]); +} + +/** Returns dynamic swipe coords if set, otherwise falls back to hardcoded defaults. */ +function getSwipeCoords(): Record { + return dynamicSwipeCoords ?? SWIPE_COORDS; +} + /** * Executes the action decided by the LLM. Returns a result for the kernel to track. */ @@ -156,7 +223,8 @@ function executeEnter(): ActionResult { function executeSwipe(action: ActionDecision): ActionResult { const direction = action.direction ?? "up"; - const coords = SWIPE_COORDS[direction] ?? SWIPE_COORDS["up"]; + const swipeCoords = getSwipeCoords(); + const coords = swipeCoords[direction] ?? 
swipeCoords["up"]; console.log(`Swiping ${direction}`); runAdbCommand([ diff --git a/android-action-kernel/src/config.ts b/android-action-kernel/src/config.ts index 1f95493..11cce55 100644 --- a/android-action-kernel/src/config.ts +++ b/android-action-kernel/src/config.ts @@ -15,7 +15,12 @@ import { DEFAULT_BEDROCK_MODEL, DEFAULT_MAX_RETRIES, DEFAULT_STUCK_THRESHOLD, - DEFAULT_VISION_ENABLED, + DEFAULT_MAX_ELEMENTS, + DEFAULT_LOG_DIR, + DEFAULT_VISION_MODE, + DEFAULT_MAX_HISTORY_STEPS, + DEFAULT_STREAMING_ENABLED, + type VisionMode, } from "./constants.js"; function env(key: string, fallback = ""): string { @@ -36,8 +41,20 @@ export const Config = { MAX_RETRIES: parseInt(env("MAX_RETRIES", String(DEFAULT_MAX_RETRIES)), 10), STUCK_THRESHOLD: parseInt(env("STUCK_THRESHOLD", String(DEFAULT_STUCK_THRESHOLD)), 10), - // Vision fallback (when accessibility tree is empty) - VISION_ENABLED: env("VISION_ENABLED", String(DEFAULT_VISION_ENABLED)) === "true", + // Vision mode: "off" | "fallback" (only when tree empty) | "always" (every step) + VISION_MODE: (env("VISION_MODE", DEFAULT_VISION_MODE) as VisionMode), + + // Smart element filtering + MAX_ELEMENTS: parseInt(env("MAX_ELEMENTS", String(DEFAULT_MAX_ELEMENTS)), 10), + + // Session logging + LOG_DIR: env("LOG_DIR", DEFAULT_LOG_DIR), + + // Multi-turn memory + MAX_HISTORY_STEPS: parseInt(env("MAX_HISTORY_STEPS", String(DEFAULT_MAX_HISTORY_STEPS)), 10), + + // Streaming responses + STREAMING_ENABLED: env("STREAMING_ENABLED", String(DEFAULT_STREAMING_ENABLED)) === "true", // LLM Provider: "groq", "openai", "bedrock", or "openrouter" LLM_PROVIDER: env("LLM_PROVIDER", "groq"), diff --git a/android-action-kernel/src/constants.ts b/android-action-kernel/src/constants.ts index 498468a..e2414ff 100644 --- a/android-action-kernel/src/constants.ts +++ b/android-action-kernel/src/constants.ts @@ -37,12 +37,38 @@ export const SCREEN_CENTER_X = 540; export const SCREEN_CENTER_Y = 1200; // Swipe coordinates: [start_x, start_y, end_x, 
end_y] +// These are the fallback values for 1080x2400 screens export const SWIPE_COORDS: Record = { up: [SCREEN_CENTER_X, 1500, SCREEN_CENTER_X, 500], down: [SCREEN_CENTER_X, 500, SCREEN_CENTER_X, 1500], left: [800, SCREEN_CENTER_Y, 200, SCREEN_CENTER_Y], right: [200, SCREEN_CENTER_Y, 800, SCREEN_CENTER_Y], }; + +/** + * Derives swipe coordinates from actual screen dimensions using ratios + * from the hardcoded 1080x2400 reference values. + */ +export function computeSwipeCoords( + width: number, + height: number +): Record { + const cx = Math.floor(width / 2); + const cy = Math.floor(height / 2); + // Vertical swipe: from 62.5% to 20.8% of height (mirrors 1500→500 on 2400h) + const vTop = Math.floor(height * 0.208); + const vBottom = Math.floor(height * 0.625); + // Horizontal swipe: from 74% to 18.5% of width (mirrors 800→200 on 1080w) + const hLeft = Math.floor(width * 0.185); + const hRight = Math.floor(width * 0.741); + + return { + up: [cx, vBottom, cx, vTop], + down: [cx, vTop, cx, vBottom], + left: [hRight, cy, hLeft, cy], + right: [hLeft, cy, hRight, cy], + }; +} export const SWIPE_DURATION_MS = "300"; export const LONG_PRESS_DURATION_MS = "1000"; @@ -76,3 +102,17 @@ export const DEFAULT_STEP_DELAY = 2.0; export const DEFAULT_MAX_RETRIES = 3; export const DEFAULT_STUCK_THRESHOLD = 3; export const DEFAULT_VISION_ENABLED = true; + +// Phase 2: Context Quality +export const DEFAULT_MAX_ELEMENTS = 40; +export const DEFAULT_LOG_DIR = "logs"; + +// Phase 3: Vision Mode +export type VisionMode = "off" | "fallback" | "always"; +export const DEFAULT_VISION_MODE: VisionMode = "fallback"; + +// Phase 4: Multi-turn Memory +export const DEFAULT_MAX_HISTORY_STEPS = 10; + +// Phase 5: Streaming +export const DEFAULT_STREAMING_ENABLED = true; diff --git a/android-action-kernel/src/kernel.ts b/android-action-kernel/src/kernel.ts index fdb36bf..db96018 100644 --- a/android-action-kernel/src/kernel.ts +++ b/android-action-kernel/src/kernel.ts @@ -5,11 +5,17 @@ * Uses LLMs 
to make decisions based on screen context. * * Features: - * - Perception → Reasoning → Action loop + * - Perception -> Reasoning -> Action loop * - Screen state diffing (stuck loop detection) * - Error recovery with retries - * - Vision fallback when accessibility tree is empty + * - Vision fallback & always-on multimodal screenshots * - Dynamic early exit on goal completion + * - Smart element filtering (compact JSON, top-N scoring) + * - Multi-turn conversation memory + * - Multi-step planning (think/plan/planProgress) + * - Streaming LLM responses + * - Session logging with crash-safe partial writes + * - Auto-detect screen resolution & foreground app * - 15 actions: tap, type, enter, swipe, home, back, wait, done, * longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell * @@ -23,55 +29,73 @@ import { Config } from "./config.js"; import { executeAction, runAdbCommand, + getScreenResolution, + getForegroundApp, + initDeviceContext, type ActionDecision, type ActionResult, } from "./actions.js"; -import { getLlmProvider, type LLMProvider } from "./llm-providers.js"; +import { + getLlmProvider, + trimMessages, + SYSTEM_PROMPT, + type LLMProvider, + type ChatMessage, + type ContentPart, +} from "./llm-providers.js"; import { getInteractiveElements, computeScreenHash, + filterElements, type UIElement, } from "./sanitizer.js"; import { DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH, } from "./constants.js"; +import { SessionLogger } from "./logger.js"; // =========================================== // Screen Perception // =========================================== +interface ScreenState { + elements: UIElement[]; + compactJson: string; +} + /** - * Dumps the current UI XML and returns parsed elements + JSON string. + * Dumps the current UI XML and returns parsed elements + compact filtered JSON for the LLM. 
*/ -function getScreenState(): { elements: UIElement[]; json: string } { +function getScreenState(): ScreenState { try { runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]); runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]); } catch { console.log("Warning: ADB screen capture failed."); - return { elements: [], json: "Error: Could not capture screen." }; + return { elements: [], compactJson: "Error: Could not capture screen." }; } if (!existsSync(Config.LOCAL_DUMP_PATH)) { - return { elements: [], json: "Error: Could not capture screen." }; + return { elements: [], compactJson: "Error: Could not capture screen." }; } const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8"); const elements = getInteractiveElements(xmlContent); - return { elements, json: JSON.stringify(elements, null, 2) }; + const compact = filterElements(elements, Config.MAX_ELEMENTS); + return { elements, compactJson: JSON.stringify(compact) }; } /** - * Captures a screenshot and returns the local file path. - * Used as a vision fallback when the accessibility tree is empty. + * Captures a screenshot and returns the base64-encoded PNG, or null on failure. 
*/ -function captureScreenshot(): string | null { +function captureScreenshotBase64(): string | null { try { runAdbCommand(["shell", "screencap", "-p", DEVICE_SCREENSHOT_PATH]); runAdbCommand(["pull", DEVICE_SCREENSHOT_PATH, LOCAL_SCREENSHOT_PATH]); if (existsSync(LOCAL_SCREENSHOT_PATH)) { - return LOCAL_SCREENSHOT_PATH; + const buffer = readFileSync(LOCAL_SCREENSHOT_PATH); + return Buffer.from(buffer).toString("base64"); } } catch { console.log("Warning: Screenshot capture failed."); @@ -122,37 +146,44 @@ function diffScreenState( } // =========================================== -// Action History Formatting +// Streaming LLM Consumer // =========================================== -function formatActionHistory( - actionHistory: ActionDecision[], - resultHistory: ActionResult[] -): string { - if (actionHistory.length === 0) return ""; +async function getDecisionStreaming( + llm: LLMProvider, + messages: ChatMessage[] +): Promise { + if (!Config.STREAMING_ENABLED || !llm.capabilities.supportsStreaming || !llm.getDecisionStream) { + return llm.getDecision(messages); + } - const lines = actionHistory.map((entry, i) => { - const actionType = entry.action ?? "unknown"; - const reason = entry.reason ?? "N/A"; - const result = resultHistory[i]; - const outcome = result ? (result.success ? "OK" : "FAILED") : ""; + let accumulated = ""; + process.stdout.write("Thinking"); + for await (const chunk of llm.getDecisionStream(messages)) { + accumulated += chunk; + process.stdout.write("."); + } + process.stdout.write("\n"); - if (actionType === "type") { - return `Step ${i + 1}: typed "${entry.text ?? ""}" - ${reason} [${outcome}]`; - } - if (actionType === "tap") { - return `Step ${i + 1}: tapped ${JSON.stringify(entry.coordinates ?? [])} - ${reason} [${outcome}]`; - } - if (actionType === "launch") { - return `Step ${i + 1}: launched ${entry.package ?? entry.uri ?? 
""} - ${reason} [${outcome}]`; - } - if (actionType === "screenshot") { - return `Step ${i + 1}: took screenshot - ${reason} [${outcome}]`; - } - return `Step ${i + 1}: ${actionType} - ${reason} [${outcome}]`; - }); + return parseJsonResponse(accumulated); +} - return "\n\nPREVIOUS_ACTIONS:\n" + lines.join("\n"); +/** Simple JSON parser with markdown fallback (duplicated from llm-providers for streaming path) */ +function parseJsonResponse(text: string): ActionDecision { + try { + return JSON.parse(text); + } catch { + const match = text.match(/\{[\s\S]*?\}/); + if (match) { + try { + return JSON.parse(match[0]); + } catch { + // fall through + } + } + console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`); + return { action: "wait", reason: "Failed to parse response, waiting" }; + } } // =========================================== @@ -162,15 +193,37 @@ function formatActionHistory( async function runAgent(goal: string, maxSteps?: number): Promise { const steps = maxSteps ?? Config.MAX_STEPS; + // Phase 1A: Auto-detect screen resolution + const resolution = getScreenResolution(); + if (resolution) { + initDeviceContext(resolution); + console.log(`Screen resolution: ${resolution[0]}x${resolution[1]}`); + } else { + console.log("Screen resolution: using default 1080x2400 swipe coords"); + } + console.log("Android Action Kernel Started"); console.log(`Goal: ${goal}`); console.log(`Provider: ${Config.LLM_PROVIDER} (${Config.getModel()})`); console.log(`Max steps: ${steps} | Step delay: ${Config.STEP_DELAY}s`); - console.log(`Vision fallback: ${Config.VISION_ENABLED ? 
"ON" : "OFF"}`); + console.log(`Vision: ${Config.VISION_MODE} | Streaming: ${Config.STREAMING_ENABLED}`); + console.log(`Max elements: ${Config.MAX_ELEMENTS} | History: ${Config.MAX_HISTORY_STEPS} steps`); const llm = getLlmProvider(); - const actionHistory: ActionDecision[] = []; - const resultHistory: ActionResult[] = []; + + // Phase 2B: Session logging + const logger = new SessionLogger( + Config.LOG_DIR, + goal, + Config.LLM_PROVIDER, + Config.getModel() + ); + + // Phase 4A: Multi-turn conversation memory + const messages: ChatMessage[] = [ + { role: "system", content: SYSTEM_PROMPT }, + ]; + let prevElements: UIElement[] = []; let stuckCount = 0; @@ -179,12 +232,20 @@ async function runAgent(goal: string, maxSteps?: number): Promise { // 1. Perception: Capture screen state console.log("Scanning screen..."); - const { elements, json: screenContext } = getScreenState(); + const { elements, compactJson: screenContext } = getScreenState(); + + // 1B. Foreground app detection + const foregroundApp = getForegroundApp(); + if (foregroundApp) { + console.log(`Foreground: ${foregroundApp}`); + } // 2. Screen diff: detect stuck loops let diffContext = ""; + let screenChanged = true; if (step > 0) { const diff = diffScreenState(prevElements, elements); + screenChanged = diff.changed; diffContext = `\n\nSCREEN_CHANGE: ${diff.summary}`; if (!diff.changed) { @@ -199,7 +260,8 @@ async function runAgent(goal: string, maxSteps?: number): Promise { diffContext += `\nWARNING: You have been stuck for ${stuckCount} steps. ` + `The screen is NOT changing. Try a DIFFERENT action: ` + - `swipe to scroll, press back, go home, or launch a different app.`; + `swipe to scroll, press back, go home, or launch a different app.` + + `\nYour plan is not working. Create a NEW plan with a different approach.`; } } else { stuckCount = 0; @@ -207,39 +269,82 @@ async function runAgent(goal: string, maxSteps?: number): Promise { } prevElements = elements; - // 3. 
Vision fallback: if accessibility tree is empty, use screenshot + // 3. Vision: capture screenshot based on VISION_MODE + let screenshotBase64: string | null = null; let visionContext = ""; - if (elements.length === 0 && Config.VISION_ENABLED) { - console.log("Accessibility tree empty. Attempting vision fallback..."); - const screenshotPath = captureScreenshot(); - if (screenshotPath) { + + const shouldCaptureVision = + Config.VISION_MODE === "always" || + (Config.VISION_MODE === "fallback" && elements.length === 0); + + if (shouldCaptureVision) { + screenshotBase64 = captureScreenshotBase64(); + if (elements.length === 0) { visionContext = "\n\nVISION_FALLBACK: The accessibility tree returned NO elements. " + "A screenshot has been captured. The screen likely contains custom-drawn " + "content (game, WebView, or Flutter). Try using coordinate-based taps on " + - "common UI positions, or use 'back'/'home' to navigate away. " + - "If you know the app package name, use 'launch' to restart it."; - console.log("Vision fallback: screenshot captured for context."); + "common UI positions, or use 'back'/'home' to navigate away."; + } + if (screenshotBase64 && llm.capabilities.supportsImages) { + console.log("Sending screenshot to LLM"); } } - // 4. Reasoning: Get LLM decision - console.log("Thinking..."); - const historyStr = formatActionHistory(actionHistory, resultHistory); - const fullContext = screenContext + historyStr + diffContext + visionContext; + // 4. Build user message with all context + const foregroundLine = foregroundApp + ? 
`FOREGROUND_APP: ${foregroundApp}\n\n` + : ""; + const textContent = + `GOAL: ${goal}\n\n${foregroundLine}SCREEN_CONTEXT:\n${screenContext}${diffContext}${visionContext}`; + // Build content parts (text + optional image) + const userContent: ContentPart[] = [{ type: "text", text: textContent }]; + if (screenshotBase64 && llm.capabilities.supportsImages) { + userContent.push({ + type: "image", + base64: screenshotBase64, + mimeType: "image/png", + }); + } + + messages.push({ role: "user", content: userContent }); + + // Trim messages to keep within history limit + const trimmed = trimMessages(messages, Config.MAX_HISTORY_STEPS); + + // 5. Reasoning: Get LLM decision + const llmStart = performance.now(); let decision: ActionDecision; try { - decision = await llm.getDecision(goal, fullContext, actionHistory); + decision = await getDecisionStreaming(llm, trimmed); } catch (err) { console.log(`LLM Error: ${(err as Error).message}`); console.log("Falling back to wait action."); - decision = { action: "wait", reason: "LLM request failed, waiting for retry" }; + decision = { action: "wait", reason: "LLM request failed, waiting" }; } + const llmLatency = performance.now() - llmStart; - console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"}`); + // Log thinking and planning + if (decision.think) { + console.log(`Think: ${decision.think}`); + } + if (decision.plan) { + console.log(`Plan: ${decision.plan.join(" -> ")}`); + } + if (decision.planProgress) { + console.log(`Progress: ${decision.planProgress}`); + } + console.log(`Decision: ${decision.action} — ${decision.reason ?? "no reason"} (${Math.round(llmLatency)}ms)`); - // 5. Action: Execute the decision + // Append assistant response to conversation + messages.push({ + role: "assistant", + content: JSON.stringify(decision), + }); + + // 6. 
Action: Execute the decision + const actionStart = performance.now(); let result: ActionResult; try { result = executeAction(decision); @@ -247,14 +352,26 @@ async function runAgent(goal: string, maxSteps?: number): Promise { console.log(`Action Error: ${(err as Error).message}`); result = { success: false, message: (err as Error).message }; } + const actionLatency = performance.now() - actionStart; - // Track history - actionHistory.push(decision); - resultHistory.push(result); + // Log step + logger.logStep( + step + 1, + foregroundApp, + elements.length, + screenChanged, + decision, + result, + Math.round(llmLatency), + Math.round(actionLatency) + ); - // 6. Check for goal completion + console.log(`Messages in context: ${trimmed.length}`); + + // 7. Check for goal completion if (decision.action === "done") { console.log("\nTask completed successfully."); + logger.finalize(true); return; } @@ -263,6 +380,7 @@ async function runAgent(goal: string, maxSteps?: number): Promise { } console.log("\nMax steps reached. Task may be incomplete."); + logger.finalize(false); } // =========================================== diff --git a/android-action-kernel/src/llm-providers.ts b/android-action-kernel/src/llm-providers.ts index 64a654b..18b3e17 100644 --- a/android-action-kernel/src/llm-providers.ts +++ b/android-action-kernel/src/llm-providers.ts @@ -1,14 +1,19 @@ /** * LLM Provider module for Android Action Kernel. * Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK). 
+ * + * Phase 3: Real multimodal vision (image content parts) + * Phase 4A: Multi-turn conversation memory (ChatMessage[] interface) + * Phase 5: Streaming responses (getDecisionStream) */ import OpenAI from "openai"; import { BedrockRuntimeClient, InvokeModelCommand, + InvokeModelWithResponseStreamCommand, } from "@aws-sdk/client-bedrock-runtime"; -import { generateText } from "ai"; +import { generateText, streamText } from "ai"; import { createOpenRouter } from "@openrouter/ai-sdk-provider"; import { Config } from "./config.js"; @@ -20,20 +25,36 @@ import { import type { ActionDecision } from "./actions.js"; // =========================================== -// System Prompt — all 15 actions + rich element context +// System Prompt — all 15 actions + planning // =========================================== -const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI. +export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI. You will receive: 1. GOAL — the user's task. -2. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates, states, and hierarchy. -3. PREVIOUS_ACTIONS — your action history with outcomes (OK/FAILED). -4. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck). -5. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView). +2. FOREGROUND_APP — the currently active app package and activity. +3. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates and states. +4. SCREENSHOT — an image of the current screen (when available). +5. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck). +6. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView). + +Previous conversation turns contain your earlier observations and actions (multi-turn memory). 
You must output ONLY a valid JSON object with your next action. +═══════════════════════════════════════════ +THINKING & PLANNING +═══════════════════════════════════════════ + +Before each action, include a "think" field with your reasoning about the current state and what to do next. + +Optionally include: +- "plan": an array of 3-5 high-level steps to achieve the goal +- "planProgress": a brief note on which plan step you're currently on + +Example: +{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"} + ═══════════════════════════════════════════ AVAILABLE ACTIONS (15 total) ═══════════════════════════════════════════ @@ -73,18 +94,13 @@ ELEMENT PROPERTIES YOU WILL SEE Each element in SCREEN_CONTEXT has: - text: visible label or content description - center: [x, y] coordinates to tap -- size: [width, height] in pixels -- enabled: whether the element can be interacted with (DO NOT tap disabled elements!) -- checked: checkbox/toggle state (true = ON) -- focused: whether this field currently has input focus -- selected: whether this item is currently selected (tabs, list items) -- scrollable: whether this container can be scrolled -- longClickable: supports long-press for context menu -- editable: text input field -- password: password input (don't read/log the text) -- hint: placeholder text shown when field is empty -- parent: the containing element (helps understand layout hierarchy) - action: suggested action — "tap", "type", "longpress", "scroll", or "read" +- enabled: false (only shown when disabled — DO NOT tap disabled elements!) 
+- checked: true (only shown for ON checkboxes/toggles) +- focused: true (only shown when field has input focus) +- hint: placeholder text (only shown when present) +- editable: true (only shown for text input fields) +- scrollable: true (only shown for scrollable containers) ═══════════════════════════════════════════ CRITICAL RULES @@ -92,7 +108,7 @@ CRITICAL RULES 1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative. 2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type". -3. ALREADY TYPED: Check PREVIOUS_ACTIONS. Do NOT re-type text you already entered. +3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered. 4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else. 5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy. 6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen. @@ -106,25 +122,67 @@ CRITICAL RULES 14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents. 15. 
CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`; +// =========================================== +// Chat Message Types (Phase 4A) +// =========================================== + +export type ContentPart = + | { type: "text"; text: string } + | { type: "image"; base64: string; mimeType: "image/png" | "image/jpeg" }; + +export interface ChatMessage { + role: "system" | "user" | "assistant"; + content: string | ContentPart[]; +} + // =========================================== // Provider Interface // =========================================== -interface ActionHistoryEntry { - action?: string; - reason?: string; - text?: string; - coordinates?: [number, number]; - package?: string; - uri?: string; +export interface LLMProvider { + readonly capabilities: { + supportsImages: boolean; + supportsStreaming: boolean; + }; + getDecision(messages: ChatMessage[]): Promise; + getDecisionStream?(messages: ChatMessage[]): AsyncIterable; } -export interface LLMProvider { - getDecision( - goal: string, - screenContext: string, - actionHistory: ActionHistoryEntry[] - ): Promise; +// =========================================== +// Message Trimming (Phase 4A) +// =========================================== + +/** + * Trims conversation messages to keep within history limit. + * Always keeps the system message. Drops oldest user/assistant pairs. + */ +export function trimMessages( + messages: ChatMessage[], + maxHistorySteps: number +): ChatMessage[] { + if (messages.length === 0) return messages; + + // System message is always first + const system = messages[0].role === "system" ? messages[0] : null; + const rest = system ? 
messages.slice(1) : messages; + + // Count user/assistant pairs (each step = 1 user + 1 assistant) + const maxMessages = maxHistorySteps * 2; + if (rest.length <= maxMessages) { + return messages; + } + + const dropped = rest.length - maxMessages; + const stepsDropped = Math.floor(dropped / 2); + const trimmed = rest.slice(dropped); + + // Insert a summary note + const summary: ChatMessage = { + role: "user", + content: `[${stepsDropped} earlier steps omitted]`, + }; + + return system ? [system, summary, ...trimmed] : [summary, ...trimmed]; } // =========================================== @@ -134,6 +192,7 @@ export interface LLMProvider { class OpenAIProvider implements LLMProvider { private client: OpenAI; private model: string; + readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean }; constructor() { if (Config.LLM_PROVIDER === "groq") { @@ -142,30 +201,70 @@ class OpenAIProvider implements LLMProvider { baseURL: GROQ_API_BASE_URL, }); this.model = Config.GROQ_MODEL; + this.capabilities = { supportsImages: false, supportsStreaming: true }; } else { this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY }); this.model = Config.OPENAI_MODEL; + this.capabilities = { supportsImages: true, supportsStreaming: true }; } } - async getDecision( - goal: string, - screenContext: string, - _actionHistory: ActionHistoryEntry[] - ): Promise { - // screenContext now includes history, diff, and vision context from kernel - const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`; + private toOpenAIMessages( + messages: ChatMessage[] + ): OpenAI.ChatCompletionMessageParam[] { + return messages.map((msg) => { + if (typeof msg.content === "string") { + return { role: msg.role, content: msg.content } as OpenAI.ChatCompletionMessageParam; + } + // Convert ContentPart[] to OpenAI format + const parts: OpenAI.ChatCompletionContentPart[] = msg.content.map( + (part) => { + if (part.type === "text") { + return { type: "text" as const, text: 
part.text }; + } + // Image — only for OpenAI (Groq skips images) + if (this.capabilities.supportsImages) { + return { + type: "image_url" as const, + image_url: { + url: `data:${part.mimeType};base64,${part.base64}`, + detail: "low" as const, + }, + }; + } + // Groq: convert image to text placeholder + return { type: "text" as const, text: "[Screenshot attached]" }; + } + ); + return { + role: msg.role, + content: parts, + } as OpenAI.ChatCompletionMessageParam; + }); + } + async getDecision(messages: ChatMessage[]): Promise { + const openaiMessages = this.toOpenAIMessages(messages); const response = await this.client.chat.completions.create({ model: this.model, response_format: { type: "json_object" }, - messages: [ - { role: "system", content: SYSTEM_PROMPT }, - { role: "user", content: userContent }, - ], + messages: openaiMessages, }); + return parseJsonResponse(response.choices[0].message.content ?? "{}"); + } - return JSON.parse(response.choices[0].message.content ?? "{}"); + async *getDecisionStream(messages: ChatMessage[]): AsyncIterable { + const openaiMessages = this.toOpenAIMessages(messages); + const stream = await this.client.chat.completions.create({ + model: this.model, + response_format: { type: "json_object" }, + messages: openaiMessages, + stream: true, + }); + for await (const chunk of stream) { + const content = chunk.choices[0]?.delta?.content; + if (content) yield content; + } } } @@ -176,6 +275,7 @@ class OpenAIProvider implements LLMProvider { class OpenRouterProvider implements LLMProvider { private openrouter: ReturnType; private model: string; + readonly capabilities = { supportsImages: true, supportsStreaming: true }; constructor() { this.openrouter = createOpenRouter({ @@ -184,21 +284,54 @@ class OpenRouterProvider implements LLMProvider { this.model = Config.OPENROUTER_MODEL; } - async getDecision( - goal: string, - screenContext: string, - _actionHistory: ActionHistoryEntry[] - ): Promise { - const userContent = `GOAL: 
${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`; + private toVercelMessages(messages: ChatMessage[]) { + // Vercel AI SDK uses a similar format but we need to convert images + const systemMsg = messages.find((m) => m.role === "system"); + const nonSystem = messages.filter((m) => m.role !== "system"); - const result = await generateText({ - model: this.openrouter.chat(this.model), - system: SYSTEM_PROMPT, - prompt: userContent + "\n\nRespond with ONLY a valid JSON object.", + const converted = nonSystem.map((msg) => { + if (typeof msg.content === "string") { + return { role: msg.role as "user" | "assistant", content: msg.content }; + } + const parts = msg.content.map((part) => { + if (part.type === "text") { + return { type: "text" as const, text: part.text }; + } + return { + type: "image" as const, + image: `data:${part.mimeType};base64,${part.base64}`, + }; + }); + return { role: msg.role as "user" | "assistant", content: parts }; }); + return { + system: typeof systemMsg?.content === "string" ? 
systemMsg.content : "", + messages: converted, + }; + } + + async getDecision(messages: ChatMessage[]): Promise { + const { system, messages: converted } = this.toVercelMessages(messages); + const result = await generateText({ + model: this.openrouter.chat(this.model), + system, + messages: converted as any, + }); return parseJsonResponse(result.text); } + + async *getDecisionStream(messages: ChatMessage[]): AsyncIterable { + const { system, messages: converted } = this.toVercelMessages(messages); + const result = streamText({ + model: this.openrouter.chat(this.model), + system, + messages: converted as any, + }); + for await (const chunk of result.textStream) { + yield chunk; + } + } } // =========================================== @@ -208,32 +341,16 @@ class OpenRouterProvider implements LLMProvider { class BedrockProvider implements LLMProvider { private client: BedrockRuntimeClient; private model: string; + readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean }; constructor() { this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION }); this.model = Config.BEDROCK_MODEL; - } - - async getDecision( - goal: string, - screenContext: string, - _actionHistory: ActionHistoryEntry[] - ): Promise { - const userContent = `GOAL: ${goal}\n\nSCREEN_CONTEXT:\n${screenContext}`; - const requestBody = this.buildRequest(userContent); - - const command = new InvokeModelCommand({ - modelId: this.model, - body: new TextEncoder().encode(requestBody), - contentType: "application/json", - accept: "application/json", - }); - - const response = await this.client.send(command); - const responseBody = JSON.parse(new TextDecoder().decode(response.body)); - const resultText = this.extractResponse(responseBody); - - return parseJsonResponse(resultText); + // Only Anthropic models on Bedrock support images + this.capabilities = { + supportsImages: this.isAnthropicModel(), + supportsStreaming: true, + }; } private isAnthropicModel(): boolean { @@ -246,32 
+363,73 @@ class BedrockProvider implements LLMProvider { ); } - private buildRequest(userContent: string): string { + private buildAnthropicMessages(messages: ChatMessage[]) { + const systemMsg = messages.find((m) => m.role === "system"); + const nonSystem = messages.filter((m) => m.role !== "system"); + + const converted = nonSystem.map((msg) => { + if (typeof msg.content === "string") { + return { role: msg.role, content: msg.content }; + } + const parts = msg.content.map((part) => { + if (part.type === "text") { + return { type: "text", text: part.text }; + } + return { + type: "image", + source: { + type: "base64", + media_type: part.mimeType, + data: part.base64, + }, + }; + }); + return { role: msg.role, content: parts }; + }); + + return { + system: typeof systemMsg?.content === "string" ? systemMsg.content : "", + messages: converted, + }; + } + + private buildRequest(messages: ChatMessage[]): string { if (this.isAnthropicModel()) { + const { system, messages: converted } = this.buildAnthropicMessages(messages); return JSON.stringify({ anthropic_version: "bedrock-2023-05-31", max_tokens: 1024, - system: SYSTEM_PROMPT, - messages: [ - { - role: "user", - content: - userContent + "\n\nRespond with ONLY a valid JSON object.", - }, - ], + system, + messages: converted, }); } + // For Meta/other models, flatten to single prompt (no multi-turn / image support) + const systemContent = messages.find((m) => m.role === "system"); + const userMessages = messages + .filter((m) => m.role === "user") + .map((m) => + typeof m.content === "string" + ? m.content + : m.content + .filter((p) => p.type === "text") + .map((p) => (p as { type: "text"; text: string }).text) + .join("\n") + ); + const lastUserContent = userMessages[userMessages.length - 1] ?? ""; + const sysText = + typeof systemContent?.content === "string" ? 
systemContent.content : ""; + if (this.isMetaModel()) { return JSON.stringify({ - prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${userContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`, + prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${sysText}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`, max_gen_len: 512, temperature: 0.1, }); } return JSON.stringify({ - inputText: `${SYSTEM_PROMPT}\n\n${userContent}\n\nRespond with ONLY a valid JSON object.`, + inputText: `${sysText}\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object.`, textGenerationConfig: { maxTokenCount: 512, temperature: 0.1, @@ -288,6 +446,56 @@ class BedrockProvider implements LLMProvider { } return responseBody.results[0].outputText; } + + async getDecision(messages: ChatMessage[]): Promise { + const requestBody = this.buildRequest(messages); + const command = new InvokeModelCommand({ + modelId: this.model, + body: new TextEncoder().encode(requestBody), + contentType: "application/json", + accept: "application/json", + }); + + const response = await this.client.send(command); + const responseBody = JSON.parse(new TextDecoder().decode(response.body)); + const resultText = this.extractResponse(responseBody); + return parseJsonResponse(resultText); + } + + async *getDecisionStream(messages: ChatMessage[]): AsyncIterable { + if (!this.isAnthropicModel()) { + // Fallback: non-streaming for non-Anthropic models + const decision = await this.getDecision(messages); + yield JSON.stringify(decision); + return; + } + + const { system, messages: converted } = this.buildAnthropicMessages(messages); + const requestBody = JSON.stringify({ + anthropic_version: 
"bedrock-2023-05-31", + max_tokens: 1024, + system, + messages: converted, + }); + + const command = new InvokeModelWithResponseStreamCommand({ + modelId: this.model, + body: new TextEncoder().encode(requestBody), + contentType: "application/json", + }); + + const response = await this.client.send(command); + if (response.body) { + for await (const event of response.body) { + if (event.chunk?.bytes) { + const data = JSON.parse(new TextDecoder().decode(event.chunk.bytes)); + if (data.type === "content_block_delta" && data.delta?.text) { + yield data.delta.text; + } + } + } + } + } } // =========================================== diff --git a/android-action-kernel/src/logger.ts b/android-action-kernel/src/logger.ts new file mode 100644 index 0000000..51199c9 --- /dev/null +++ b/android-action-kernel/src/logger.ts @@ -0,0 +1,129 @@ +/** + * Session logging for Android Action Kernel. + * Writes incremental .partial.json after each step (crash-safe), + * and a final .json summary at session end. 
+ */ + +import { mkdirSync, writeFileSync } from "fs"; +import { join } from "path"; +import type { ActionDecision } from "./actions.js"; + +export interface StepLog { + step: number; + timestamp: string; + foregroundApp: string | null; + elementCount: number; + screenChanged: boolean; + llmDecision: { + action: string; + reason?: string; + coordinates?: [number, number]; + text?: string; + think?: string; + plan?: string[]; + planProgress?: string; + }; + actionResult: { + success: boolean; + message: string; + }; + llmLatencyMs: number; + actionLatencyMs: number; +} + +export interface SessionSummary { + sessionId: string; + goal: string; + provider: string; + model: string; + startTime: string; + endTime: string; + totalSteps: number; + successCount: number; + failCount: number; + completed: boolean; + steps: StepLog[]; +} + +export class SessionLogger { + private sessionId: string; + private logDir: string; + private steps: StepLog[] = []; + private goal: string; + private provider: string; + private model: string; + private startTime: string; + + constructor(logDir: string, goal: string, provider: string, model: string) { + this.sessionId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`; + this.logDir = logDir; + this.goal = goal; + this.provider = provider; + this.model = model; + this.startTime = new Date().toISOString(); + + mkdirSync(this.logDir, { recursive: true }); + } + + logStep( + step: number, + foregroundApp: string | null, + elementCount: number, + screenChanged: boolean, + decision: ActionDecision, + result: { success: boolean; message: string }, + llmLatencyMs: number, + actionLatencyMs: number + ): void { + const entry: StepLog = { + step, + timestamp: new Date().toISOString(), + foregroundApp, + elementCount, + screenChanged, + llmDecision: { + action: decision.action, + reason: decision.reason, + coordinates: decision.coordinates, + text: decision.text, + think: decision.think, + plan: decision.plan, + planProgress: 
decision.planProgress, + }, + actionResult: { + success: result.success, + message: result.message, + }, + llmLatencyMs, + actionLatencyMs, + }; + this.steps.push(entry); + + // Write partial file after each step (crash-safe) + const partialPath = join(this.logDir, `${this.sessionId}.partial.json`); + writeFileSync(partialPath, JSON.stringify(this.buildSummary(false), null, 2)); + } + + finalize(completed: boolean): void { + const summary = this.buildSummary(completed); + const finalPath = join(this.logDir, `${this.sessionId}.json`); + writeFileSync(finalPath, JSON.stringify(summary, null, 2)); + console.log(`Session log saved: ${finalPath}`); + } + + private buildSummary(completed: boolean): SessionSummary { + return { + sessionId: this.sessionId, + goal: this.goal, + provider: this.provider, + model: this.model, + startTime: this.startTime, + endTime: new Date().toISOString(), + totalSteps: this.steps.length, + successCount: this.steps.filter((s) => s.actionResult.success).length, + failCount: this.steps.filter((s) => !s.actionResult.success).length, + completed, + steps: this.steps, + }; + } +} diff --git a/android-action-kernel/src/sanitizer.ts b/android-action-kernel/src/sanitizer.ts index 684b009..2ec1fed 100644 --- a/android-action-kernel/src/sanitizer.ts +++ b/android-action-kernel/src/sanitizer.ts @@ -169,3 +169,81 @@ export function getInteractiveElements(xmlContent: string): UIElement[] { walk(parsed, "root", 0); return elements; } + +// =========================================== +// Smart Element Filtering (Phase 2A) +// =========================================== + +/** + * Compact representation sent to the LLM — only essential fields. + * Non-default flags are included conditionally to minimize tokens. 
+ */
+export interface CompactUIElement {
+  text: string;
+  center: [number, number];
+  action: UIElement["action"];
+  // Only included when non-default
+  enabled?: false;
+  checked?: true;
+  focused?: true;
+  hint?: string;
+  editable?: true;
+  scrollable?: true;
+}
+
+/**
+ * Strips a full UIElement to its compact form, omitting default-valued flags.
+ */
+export function compactElement(el: UIElement): CompactUIElement {
+  const compact: CompactUIElement = {
+    text: el.text,
+    center: el.center,
+    action: el.action,
+  };
+  if (!el.enabled) compact.enabled = false;
+  if (el.checked) compact.checked = true;
+  if (el.focused) compact.focused = true;
+  if (el.hint) compact.hint = el.hint;
+  if (el.editable) compact.editable = true;
+  if (el.scrollable) compact.scrollable = true;
+  return compact;
+}
+
+/**
+ * Scores an element for relevance to the LLM.
+ */
+function scoreElement(el: UIElement): number {
+  let score = 0;
+  if (el.enabled) score += 10;
+  if (el.editable) score += 8;
+  if (el.focused) score += 6;
+  if (el.clickable || el.longClickable) score += 5;
+  if (el.text) score += 3;
+  return score;
+}
+
+/**
+ * Deduplicates elements whose centers snap to the same 5px grid bucket
+ * (note: nearby elements in adjacent buckets are kept), scores them,
+ * and returns the top N as compact elements.
+ */
+export function filterElements(
+  elements: UIElement[],
+  limit: number
+): CompactUIElement[] {
+  // Deduplicate by snapping centers to a 5px grid; elements sharing a bucket collapse to the highest-scoring one
+  const seen = new Map();
+  for (const el of elements) {
+    const bucketX = Math.round(el.center[0] / 5) * 5;
+    const bucketY = Math.round(el.center[1] / 5) * 5;
+    const key = `${bucketX},${bucketY}`;
+    const existing = seen.get(key);
+    if (!existing || scoreElement(el) > scoreElement(existing)) {
+      seen.set(key, el);
+    }
+  }
+
+  // Score, sort descending, take top N
+  const deduped = Array.from(seen.values());
+  deduped.sort((a, b) => scoreElement(b) - scoreElement(a));
+  return deduped.slice(0, limit).map(compactElement);
+}