/**
 * Skills module for DroidClaw.
 * Multi-step smart actions that reduce LLM decision points and eliminate
 * entire categories of errors (coordinate guessing, wrong submit buttons, etc.)
 *
 * Skills:
 *   read_screen       — Scroll through the screen, collect all text, copy it to the clipboard
 *   submit_message    — Find and tap the Send/Submit button in chat apps
 *   copy_visible_text — Read text from screen elements and set clipboard programmatically
 *   wait_for_content  — Wait for new content to appear (AI responses, page loads)
 *   find_and_tap      — Find an element by text label and tap it
 *   compose_email     — Open a mailto: compose screen and paste the body text
 */

import { existsSync, readFileSync } from "fs";
import { Config } from "./config.js";
import { runAdbCommand, getSwipeCoords, type ActionDecision, type ActionResult } from "./actions.js";
import { getInteractiveElements, type UIElement } from "./sanitizer.js";
import { SWIPE_DURATION_MS } from "./constants.js";

/**
 * Routes a skill action to the appropriate skill function.
 *
 * The skill name is taken from `decision.skill`, falling back to
 * `decision.action` when `skill` is null/undefined.
 *
 * @param decision - The action decision naming the skill and carrying its arguments.
 * @param elements - UI elements of the current screen, as already scanned by the caller.
 * @returns The skill's result, or a failure result for an unknown skill name.
 */
export function executeSkill(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  const skill = decision.skill ?? decision.action;
  console.log(`Executing multi-step action: ${skill}`);

  switch (skill) {
    case "read_screen":
      return readScreen(elements);
    case "submit_message":
      return submitMessage(elements);
    case "copy_visible_text":
      return copyVisibleText(decision, elements);
    case "wait_for_content":
      return waitForContent(elements);
    case "find_and_tap":
      return findAndTap(decision, elements);
    case "compose_email":
      return composeEmail(decision, elements);
    default:
      return { success: false, message: `Unknown skill: ${skill}` };
  }
}

// ===========================================
// Helpers: shell quoting, clipboard, re-scan screen
// ===========================================

/**
 * Wraps text in single quotes for the device shell, escaping embedded
 * single quotes as `'\''` so the whole value stays one shell word.
 */
function shellQuote(text: string): string {
  return `'${text.replaceAll("'", "'\\''")}'`;
}

/**
 * Sets clipboard text via ADB with proper shell escaping.
 * ADB shell joins args into a single string, so parentheses/quotes break it.
 * Wrapping in single quotes and escaping internal quotes fixes this.
 */
function safeClipboardSet(text: string): void {
  runAdbCommand(["shell", `cmd clipboard set-text ${shellQuote(text)}`]);
}

/**
 * Captures the current screen again: dumps the UI hierarchy with
 * `uiautomator dump`, pulls it to `Config.LOCAL_DUMP_PATH`, and parses it.
 *
 * @returns The parsed interactive elements, or an empty array when an ADB
 *   command throws or the pulled dump file does not exist locally.
 */
function rescanScreen(): UIElement[] {
  try {
    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
  } catch {
    console.log("Warning: ADB screen capture failed during skill re-scan.");
    return [];
  }

  if (!existsSync(Config.LOCAL_DUMP_PATH)) return [];
  const xmlContent = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
  return getInteractiveElements(xmlContent);
}

/**
 * Returns the non-empty texts present in `after` that did not appear on any
 * element of `before`. Order follows `after`; duplicates within `after` are kept.
 */
function findNewTexts(before: UIElement[], after: UIElement[]): string[] {
  const originalTexts = new Set(before.map((el) => el.text).filter(Boolean));
  return after
    .map((el) => el.text)
    .filter((t) => t && !originalTexts.has(t));
}

// ===========================================
// Skill 0: read_screen (scroll + collect all text)
// ===========================================

/**
 * Collects every distinct element text from the current screen, then swipes
 * up (scrolling down) up to 5 times, re-scanning after each swipe, until a
 * scroll yields no new text. The combined text is copied to the clipboard
 * when it is non-empty.
 *
 * @param elements - Elements of the screen before any scrolling.
 * @returns A success result whose `data` is the collected text joined by newlines.
 */
function readScreen(elements: UIElement[]): ActionResult {
  const allTexts: string[] = [];
  const seenTexts = new Set<string>();

  // Appends texts not seen before; returns how many were added.
  function collectTexts(els: UIElement[]): number {
    let added = 0;
    for (const el of els) {
      if (el.text && !seenTexts.has(el.text)) {
        seenTexts.add(el.text);
        allTexts.push(el.text);
        added++;
      }
    }
    return added;
  }

  // 1. Collect from initial screen
  collectTexts(elements);

  // 2. Scroll down and collect until no new content
  const swipeCoords = getSwipeCoords();
  const upCoords = swipeCoords["up"]; // swipe up = scroll down = see more below
  const maxScrolls = 5;
  let scrollsDone = 0;

  for (let i = 0; i < maxScrolls; i++) {
    runAdbCommand([
      "shell", "input", "swipe",
      String(upCoords[0]), String(upCoords[1]),
      String(upCoords[2]), String(upCoords[3]),
      // String() keeps the argument list uniformly string-typed; it is a
      // no-op if the constant is already a string.
      String(SWIPE_DURATION_MS),
    ]);
    Bun.sleepSync(1500);
    scrollsDone++;

    const newElements = rescanScreen();
    const added = collectTexts(newElements);
    console.log(`read_screen: Scroll ${scrollsDone} — found ${added} new text elements`);

    if (added === 0) break;
  }

  const combinedText = allTexts.join("\n");

  // 3. Copy to clipboard for easy access
  const copied = combinedText.length > 0;
  if (copied) {
    safeClipboardSet(combinedText);
  }

  // Only claim a clipboard copy when one actually happened, so the caller
  // does not paste stale clipboard contents after an empty read.
  const clipboardNote = copied ? "copied to clipboard" : "clipboard left unchanged";
  return {
    success: true,
    message: `Read ${allTexts.length} text elements across ${scrollsDone} scrolls (${combinedText.length} chars), ${clipboardNote}`,
    data: combinedText,
  };
}

// ===========================================
// Skill 1: submit_message
// ===========================================

const SEND_BUTTON_PATTERN = /send|submit|post|arrow|paper.?plane/i;

/**
 * Finds and taps a Send/Submit button, waits 6 seconds, then re-scans the
 * screen and reports any text that was not present before the tap.
 *
 * @param elements - Elements of the current screen.
 * @returns Failure when no candidate button exists; otherwise success, with
 *   `data` set to a summary of up to 3 new texts when new content appeared.
 */
function submitMessage(elements: UIElement[]): ActionResult {
  // 1. Search for Send/Submit button by text or resource ID
  let candidates = elements.filter(
    (el) =>
      el.enabled &&
      (el.clickable || el.action === "tap") &&
      (SEND_BUTTON_PATTERN.test(el.text) || SEND_BUTTON_PATTERN.test(el.id))
  );

  // 2. If no pattern match, fall back to position: enabled clickable elements
  //    whose center Y is within 80%..100% of the lowest such element's Y,
  //    preferring the rightmost (common Send button position).
  if (candidates.length === 0) {
    const screenBottom = elements
      .filter((el) => el.enabled && el.clickable)
      .sort((a, b) => b.center[1] - a.center[1]);

    if (screenBottom.length > 0) {
      const maxY = screenBottom[0].center[1];
      const threshold = maxY * 0.8;
      candidates = screenBottom.filter((el) => el.center[1] >= threshold);
      // Prefer rightmost element (Send buttons are usually on the right)
      candidates.sort((a, b) => b.center[0] - a.center[0]);
    }
  }

  if (candidates.length === 0) {
    return {
      success: false,
      message: "Could not find a Send/Submit button on screen",
    };
  }

  // 3. Tap the best match
  const target = candidates[0];
  const [x, y] = target.center;
  console.log(
    `submit_message: Tapping "${target.text}" at (${x}, ${y})`
  );
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);

  // 4. Wait for response to generate
  console.log("submit_message: Waiting 6s for response...");
  Bun.sleepSync(6000);

  // 5. Re-scan screen and check for new content
  const newTexts = findNewTexts(elements, rescanScreen());

  if (newTexts.length > 0) {
    const summary = newTexts.slice(0, 3).join("; ");
    return {
      success: true,
      message: `Tapped "${target.text}" and new content appeared: ${summary}`,
      data: summary,
    };
  }

  return {
    success: true,
    message: `Tapped "${target.text}" at (${x}, ${y}). No new content yet — may still be loading.`,
  };
}

// ===========================================
// Skill 2: copy_visible_text
// ===========================================

/**
 * Copies on-screen text to the clipboard without using the UI copy gesture.
 *
 * Prefers elements whose action is "read"; when none of those qualify, falls
 * back to every element that has text. If `decision.query` is set, only
 * elements whose text contains it (case-insensitive) are considered.
 *
 * @param decision - May carry an optional `query` substring filter.
 * @param elements - Elements of the current screen.
 * @returns Failure when nothing matches; otherwise success with `data` set to
 *   the matching texts joined by newlines in top-to-bottom order.
 */
function copyVisibleText(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  const query = decision.query?.toLowerCase();
  // With no (or an empty) query every element passes.
  const matchesQuery = (el: UIElement): boolean =>
    !query || el.text.toLowerCase().includes(query);

  // 1. Readable text elements, narrowed by the query when one is provided
  let textElements = elements.filter(
    (el) => el.text && el.action === "read" && matchesQuery(el)
  );

  // 2. If no read-only text qualifies, include all elements with text
  if (textElements.length === 0) {
    textElements = elements.filter((el) => el.text && matchesQuery(el));
  }

  if (textElements.length === 0) {
    return {
      success: false,
      message: decision.query
        ? `No text matching "${decision.query}" found on screen`
        : "No readable text found on screen",
    };
  }

  // 3. Sort by vertical position (top to bottom)
  textElements.sort((a, b) => a.center[1] - b.center[1]);

  // 4. Concatenate text
  const combinedText = textElements.map((el) => el.text).join("\n");

  // 5. Set clipboard programmatically
  console.log(
    `copy_visible_text: Copying ${textElements.length} text elements (${combinedText.length} chars)`
  );
  safeClipboardSet(combinedText);

  return {
    success: true,
    message: `Copied ${textElements.length} text elements to clipboard (${combinedText.length} chars)`,
    data: combinedText,
  };
}

// ===========================================
// Skill 3: wait_for_content
// ===========================================

/**
 * Polls the screen up to 5 times at 3-second intervals (15s max) and succeeds
 * as soon as the texts that were not on the original screen total more than
 * 20 characters.
 *
 * @param elements - Elements of the screen before waiting.
 * @returns Success with a summary of up to 5 new texts, or failure after 15s.
 */
function waitForContent(elements: UIElement[]): ActionResult {
  const maxAttempts = 5;

  for (let i = 0; i < maxAttempts; i++) {
    console.log(
      `wait_for_content: Waiting 3s... (attempt ${i + 1}/5)`
    );
    Bun.sleepSync(3000);

    // Re-scan and compare against the texts recorded before waiting
    const newTexts = findNewTexts(elements, rescanScreen());

    // Check if meaningful new content appeared (>20 chars total)
    const totalNewChars = newTexts.reduce((sum, t) => sum + t.length, 0);
    if (totalNewChars > 20) {
      const summary = newTexts.slice(0, 5).join("; ");
      console.log(
        `wait_for_content: Found ${newTexts.length} new text elements (${totalNewChars} chars)`
      );
      return {
        success: true,
        message: `New content appeared after ${(i + 1) * 3}s: ${summary}`,
        data: summary,
      };
    }
  }

  return {
    success: false,
    message: "No new content appeared after 15s",
  };
}

// ===========================================
// Skill 4: find_and_tap
// ===========================================

/**
 * Taps the element whose text best matches `decision.query`.
 *
 * Matching is a case-insensitive substring search. Candidates are scored:
 * +10 enabled, +5 clickable or long-clickable, +20 exact text match
 * (otherwise +5 for a partial match); the highest score wins.
 *
 * @param decision - Must carry `query`, the label text to look for.
 * @param elements - Elements of the current screen.
 * @returns Failure when `query` is missing or nothing matches (the message
 *   then lists up to 15 available texts); otherwise success with `data` set
 *   to the tapped element's text.
 */
function findAndTap(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  const query = decision.query;
  if (!query) {
    return { success: false, message: "find_and_tap requires a query" };
  }

  const queryLower = query.toLowerCase();

  // 1. Search elements for text matching query
  const matches = elements.filter(
    (el) => el.text && el.text.toLowerCase().includes(queryLower)
  );

  if (matches.length === 0) {
    // Return available element texts to help the LLM
    const available = elements
      .filter((el) => el.text)
      .map((el) => el.text)
      .slice(0, 15);
    return {
      success: false,
      message: `No element matching "${query}" found. Available: ${available.join(", ")}`,
    };
  }

  // 2. Score matches
  const scored = matches.map((el) => {
    let score = 0;
    if (el.enabled) score += 10;
    if (el.clickable || el.longClickable) score += 5;
    if (el.text.toLowerCase() === queryLower) score += 20; // exact match
    else score += 5; // partial match
    return { el, score };
  });

  // 3. Pick highest-scoring match
  scored.sort((a, b) => b.score - a.score);
  const best = scored[0].el;
  const [x, y] = best.center;

  // 4. Tap it
  console.log(
    `find_and_tap: Tapping "${best.text}" at (${x}, ${y}) [score: ${scored[0].score}]`
  );
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);

  return {
    success: true,
    message: `Found and tapped "${best.text}" at (${x}, ${y})`,
    data: best.text,
  };
}

// ===========================================
// Skill 5: compose_email
// ===========================================

// NOTE(review): only the BODY_* patterns are referenced by the code visible
// in this file; the TO_* and SUBJECT_* patterns appear to be unused here.

/** Patterns to identify email compose fields by resource ID */
const TO_FIELD_PATTERN = /to|recipient/i;
const SUBJECT_FIELD_PATTERN = /subject/i;
const BODY_FIELD_PATTERN = /body|compose_area|compose_edit|message_content/i;

/** Patterns to identify fields by hint text */
const TO_HINT_PATTERN = /^to$|recipient|email.?address/i;
const SUBJECT_HINT_PATTERN = /subject/i;
const BODY_HINT_PATTERN = /compose|body|message|write/i;

/**
 * Finds an editable field matching the given ID and hint patterns.
 *
 * Tries, in order: `idPattern` against the resource ID, `hintPattern` against
 * the hint, then `idPattern` against the visible text. There is no positional
 * fallback here; callers must handle an `undefined` result themselves.
 *
 * @returns The first matching field, or `undefined` when nothing matches.
 */
function findEmailField(
  editables: UIElement[],
  idPattern: RegExp,
  hintPattern: RegExp
): UIElement | undefined {
  // Try resource ID first (most reliable)
  const byId = editables.find((el) => idPattern.test(el.id));
  if (byId) return byId;

  // Try hint text
  const byHint = editables.find((el) => el.hint && hintPattern.test(el.hint));
  if (byHint) return byHint;

  // Try visible label/text
  const byText = editables.find((el) => idPattern.test(el.text));
  if (byText) return byText;

  return undefined;
}

/** Try to extract an email address from a string */
function extractEmail(text: string): string | null {
  const match = text.match(/[\w.+-]+@[\w.-]+\.\w{2,}/);
  return match ? match[0] : null;
}

/**
 * Opens an email compose screen through a `mailto:` SENDTO intent and pastes
 * the body into it.
 *
 * The recipient comes from `decision.query`, or is extracted from
 * `decision.text` when `query` is absent. After launching the intent the
 * screen is re-scanned, the body field is located (by pattern, else the
 * lowest editable field), tapped, and a paste key event is sent. When
 * `decision.text` is set it is placed on the clipboard first; otherwise the
 * existing clipboard contents are pasted.
 *
 * @param decision - Carries `query` (email address) and optional `text` (body).
 * @param elements - Elements of the current screen; not consulted, because
 *   the compose screen is scanned fresh after the intent launches.
 * @returns Failure when no address can be resolved or no editable field
 *   appears; otherwise success.
 */
function composeEmail(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  // Resolve email address: try query first, then extract from text
  let emailAddress = decision.query;
  const bodyContent = decision.text;

  if (!emailAddress && bodyContent) {
    const extracted = extractEmail(bodyContent);
    if (extracted) {
      emailAddress = extracted;
      console.log(`compose_email: Extracted email "${emailAddress}" from text field`);
    }
  }

  if (!emailAddress) {
    return {
      success: false,
      message: "compose_email requires query (email address). Example: {\"action\": \"compose_email\", \"query\": \"user@example.com\"}",
    };
  }

  // Always use mailto: intent — this is the most reliable path.
  // It opens the default email app with To pre-filled, regardless of current screen.
  console.log(`compose_email: Launching mailto:${emailAddress}`);
  runAdbCommand([
    "shell", "am", "start", "-a", "android.intent.action.SENDTO",
    // The address is caller-supplied and adb joins shell args into one
    // command string, so quote it to keep spaces/metacharacters inert.
    "-d", shellQuote(`mailto:${emailAddress}`),
  ]);
  Bun.sleepSync(2500);

  // Re-scan to find the compose screen
  const freshElements = rescanScreen();
  const editables = freshElements
    .filter((el) => el.editable && el.enabled)
    .sort((a, b) => a.center[1] - b.center[1]);

  if (editables.length === 0) {
    return { success: false, message: "Launched email compose but no editable fields appeared" };
  }

  // Find the body field — mailto: already handled the To field
  let bodyField = findEmailField(editables, BODY_FIELD_PATTERN, BODY_HINT_PATTERN);
  if (!bodyField) {
    // Positional fallback: editables are sorted top-to-bottom, so take the
    // last (lowest) editable field as the body.
    bodyField = editables[editables.length - 1];
  }

  const [bx, by] = bodyField.center;
  console.log(`compose_email: Tapping Body field at (${bx}, ${by})`);
  runAdbCommand(["shell", "input", "tap", String(bx), String(by)]);
  Bun.sleepSync(300);

  // Paste body content — use explicit text if provided, otherwise paste clipboard
  if (bodyContent) {
    safeClipboardSet(bodyContent);
    Bun.sleepSync(200);
  }
  runAdbCommand(["shell", "input", "keyevent", "279"]); // KEYCODE_PASTE

  return {
    success: true,
    message: `Email compose opened to ${emailAddress}, body pasted`,
  };
}