From 792b42974ff7ae6c9550624c717d47b22671ab64 Mon Sep 17 00:00:00 2001
From: Sanju Sivalingam
Date: Wed, 18 Feb 2026 00:58:59 +0530
Subject: [PATCH] feat(agent): implement server-side multi-step skills

Skills (copy_visible_text, find_and_tap, submit_message, read_screen,
wait_for_content, compose_email) were CLI-only using direct ADB. The
server prompt advertised them but they silently failed when chosen.

Now intercepted in the agent loop before actionToCommand() and executed
server-side using existing WebSocket primitives (get_screen, tap, swipe,
clipboard_set). Each skill replaces 3-8 LLM calls with deterministic
server-side logic.
---
 server/src/agent/loop.ts   |  33 ++-
 server/src/agent/skills.ts | 489 +++++++++++++++++++++++++++++++++++++
 2 files changed, 512 insertions(+), 10 deletions(-)
 create mode 100644 server/src/agent/skills.ts

diff --git a/server/src/agent/loop.ts b/server/src/agent/loop.ts
index 8f80dc4..a39d6c6 100644
--- a/server/src/agent/loop.ts
+++ b/server/src/agent/loop.ts
@@ -26,6 +26,7 @@ import {
   type LLMConfig,
 } from "./llm.js";
 import { formatAppHints } from "./hints.js";
+import { isSkillAction, executeSkill } from "./skills.js";
 import { createStuckDetector } from "./stuck.js";
 import { db } from "../db.js";
 import { agentSession, agentStep, device as deviceTable } from "../schema.js";
@@ -568,20 +569,32 @@ export async function runAgentLoop(
         );
       }
 
-      // ── 9. Execute on device ────────────────────────────────
-      const command = actionToCommand(action);
+      // ── 9. Execute on device (skills intercepted server-side) ──
       try {
-        const result = (await sessions.sendCommand(deviceId, command)) as {
-          success?: boolean;
-          error?: string;
-          data?: string;
-        };
-        const resultSuccess = result.success !== false;
-        lastActionFeedback = `${actionSig} -> ${resultSuccess ? "OK" : "FAILED"}: ${result.error ?? result.data ?? "completed"}`;
+        if (isSkillAction(action.action)) {
+          // Multi-step skill: run server-side using WebSocket primitives
+          const skillResult = await executeSkill(
+            deviceId,
+            action as unknown as Record<string, unknown> & { action: string },
+            elements
+          );
+          lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`;
+        } else {
+          // Regular action: map to WebSocket command and send to device
+          const command = actionToCommand(action);
+          const result = (await sessions.sendCommand(deviceId, command)) as {
+            success?: boolean;
+            error?: string;
+            data?: string;
+          };
+          const resultSuccess = result.success !== false;
+          lastActionFeedback = `${actionSig} -> ${resultSuccess ? "OK" : "FAILED"}: ${result.error ?? result.data ?? "completed"}`;
+        }
         console.log(`[Agent ${sessionId}] Step ${step + 1} result: ${lastActionFeedback}`);
         // Append result to last history entry
         if (actionHistory.length > 0) {
-          actionHistory[actionHistory.length - 1] += ` → ${resultSuccess ? "OK" : "FAILED"}`;
+          const ok = lastActionFeedback.includes("-> OK");
+          actionHistory[actionHistory.length - 1] += ` → ${ok ? "OK" : "FAILED"}`;
         }
         // Update step result in DB
         if (persistentDeviceId) {
diff --git a/server/src/agent/skills.ts b/server/src/agent/skills.ts
new file mode 100644
index 0000000..9868cf0
--- /dev/null
+++ b/server/src/agent/skills.ts
@@ -0,0 +1,489 @@
+/**
+ * Server-side multi-step skills for the DroidClaw agent loop.
+ *
+ * These replace 3-8 LLM calls with deterministic server-side logic.
+ * Each skill uses sessions.sendCommand() to interact with the device
+ * via WebSocket — no direct ADB needed.
+ *
+ * Skills:
+ *   copy_visible_text — Extract text from screen elements, set clipboard
+ *   find_and_tap      — Search elements by text, scroll if needed, tap
+ *   submit_message    — Find Send/Submit button and tap it
+ *   read_screen       — Scroll through page, collect all text, set clipboard
+ *   wait_for_content  — Poll for new content to appear
+ *   compose_email     — Launch mailto: intent, paste body
+ */
+
+import { sessions } from "../ws/sessions.js";
+import type { UIElement } from "@droidclaw/shared";
+
+// ─── Types ──────────────────────────────────────────────────────
+
+/** Outcome of a skill run, reported back to the agent loop. */
+export interface SkillResult {
+  success: boolean;
+  message: string;
+  data?: string;
+}
+
+/** Skill invocation as produced by the LLM: the skill name plus optional arguments. */
+interface SkillAction {
+  action: string;
+  query?: string;
+  text?: string;
+  [key: string]: unknown;
+}
+
+// ─── Skill Registry ─────────────────────────────────────────────
+
+const SKILL_ACTIONS = new Set<string>([
+  "copy_visible_text",
+  "find_and_tap",
+  "submit_message",
+  "read_screen",
+  "wait_for_content",
+  "compose_email",
+]);
+
+/** True when `action` names a skill that executeSkill() handles. */
+export function isSkillAction(action: string): boolean {
+  return SKILL_ACTIONS.has(action);
+}
+
+/**
+ * Execute a multi-step skill server-side.
+ *
+ * Callers should check isSkillAction() first; an action name that is not a
+ * registered skill yields a failed SkillResult ("Unknown skill: ...").
+ */
+export async function executeSkill(
+  deviceId: string,
+  action: SkillAction,
+  currentElements: UIElement[]
+): Promise<SkillResult> {
+  switch (action.action) {
+    case "copy_visible_text":
+      return copyVisibleText(deviceId, action, currentElements);
+    case "find_and_tap":
+      return findAndTap(deviceId, action, currentElements);
+    case "submit_message":
+      return submitMessage(deviceId, currentElements);
+    case "read_screen":
+      return readScreen(deviceId, currentElements);
+    case "wait_for_content":
+      return waitForContent(deviceId, currentElements);
+    case "compose_email":
+      return composeEmail(deviceId, action);
+    default:
+      return { success: false, message: `Unknown skill: ${action.action}` };
+  }
+}
+
+// ─── Helpers ────────────────────────────────────────────────────
+
+/**
+ * Fetch the current screen from the device. Best-effort: a failed
+ * get_screen is logged and reported as an empty element list so polling
+ * skills can keep going.
+ */
+async function getScreen(
+  deviceId: string
+): Promise<{ elements: UIElement[]; packageName?: string }> {
+  try {
+    const res = (await sessions.sendCommand(deviceId, {
+      type: "get_screen",
+    })) as { elements?: UIElement[]; packageName?: string };
+    return { elements: res.elements ?? [], packageName: res.packageName };
+  } catch (err) {
+    console.warn("[Skill] get_screen failed:", err);
+    return { elements: [] };
+  }
+}
+
+async function tap(deviceId: string, x: number, y: number): Promise<void> {
+  await sessions.sendCommand(deviceId, { type: "tap", x, y });
+}
+
+async function swipeDown(deviceId: string): Promise<void> {
+  // Scroll down = swipe from bottom to top (1080px wide screen defaults)
+  await sessions.sendCommand(deviceId, {
+    type: "swipe",
+    x1: 540, y1: 1600, x2: 540, y2: 400,
+  });
+}
+
+async function clipboardSet(deviceId: string, text: string): Promise<void> {
+  await sessions.sendCommand(deviceId, { type: "clipboard_set", text });
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+/**
+ * Pick the best element whose text contains `queryLower` (case-insensitive
+ * substring). Exact matches, enabled and clickable elements rank higher.
+ */
+function findMatch(
+  elements: UIElement[],
+  queryLower: string
+): UIElement | null {
+  const matches = elements.filter(
+    (el) => el.text && el.text.toLowerCase().includes(queryLower)
+  );
+  if (matches.length === 0) return null;
+
+  const scored = matches.map((el) => {
+    let score = 0;
+    if (el.enabled) score += 10;
+    if (el.clickable || el.longClickable) score += 5;
+    if (el.text.toLowerCase() === queryLower) score += 20;
+    else score += 5;
+    return { el, score };
+  });
+  scored.sort((a, b) => b.score - a.score);
+  return scored[0].el;
+}
+
+// ─── Skill: copy_visible_text ───────────────────────────────────
+
+async function copyVisibleText(
+  deviceId: string,
+  action: SkillAction,
+  elements: UIElement[]
+): Promise<SkillResult> {
+  // 1. Filter for readable text elements
+  let textElements = elements.filter((el) => el.text && el.action === "read");
+
+  // 2. If query provided, filter to matching elements
+  if (action.query) {
+    const query = action.query.toLowerCase();
+    textElements = textElements.filter((el) =>
+      el.text.toLowerCase().includes(query)
+    );
+  }
+
+  // Fallback: include all elements with text
+  if (textElements.length === 0) {
+    textElements = elements.filter((el) => el.text);
+    if (action.query) {
+      const query = action.query.toLowerCase();
+      textElements = textElements.filter((el) =>
+        el.text.toLowerCase().includes(query)
+      );
+    }
+  }
+
+  if (textElements.length === 0) {
+    return {
+      success: false,
+      message: action.query
+        ? `No text matching "${action.query}" found on screen`
+        : "No readable text found on screen",
+    };
+  }
+
+  // 3. Sort by vertical position (top to bottom)
+  textElements.sort((a, b) => a.center[1] - b.center[1]);
+
+  // 4. Concatenate and set clipboard
+  const combinedText = textElements.map((el) => el.text).join("\n");
+  await clipboardSet(deviceId, combinedText);
+
+  return {
+    success: true,
+    message: `Copied ${textElements.length} text elements to clipboard (${combinedText.length} chars)`,
+    data: combinedText.slice(0, 200),
+  };
+}
+
+// ─── Skill: find_and_tap ────────────────────────────────────────
+
+async function findAndTap(
+  deviceId: string,
+  action: SkillAction,
+  elements: UIElement[]
+): Promise<SkillResult> {
+  const query = action.query;
+  if (!query) {
+    return { success: false, message: "find_and_tap requires a query" };
+  }
+
+  const queryLower = query.toLowerCase();
+
+  // 1. Check current screen
+  let best = findMatch(elements, queryLower);
+  // Most recent screen contents, so a failure reports what is visible now
+  let visibleElements = elements;
+
+  // 2. If not found, scroll down and re-check (up to 8 scrolls)
+  if (!best) {
+    const maxScrolls = 8;
+    for (let i = 0; i < maxScrolls; i++) {
+      console.log(
+        `[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})`
+      );
+      await swipeDown(deviceId);
+      await sleep(1200);
+
+      const { elements: freshElements } = await getScreen(deviceId);
+      visibleElements = freshElements;
+      best = findMatch(freshElements, queryLower);
+      if (best) {
+        console.log(
+          `[Skill] find_and_tap: Found "${query}" after ${i + 1} scroll(s)`
+        );
+        break;
+      }
+    }
+  }
+
+  if (!best) {
+    const available = visibleElements
+      .filter((el) => el.text)
+      .map((el) => el.text)
+      .slice(0, 10);
+    return {
+      success: false,
+      message: `No element matching "${query}" found after scrolling. Visible: ${available.join(", ")}`,
+    };
+  }
+
+  // 3. Tap it
+  const [x, y] = best.center;
+  console.log(`[Skill] find_and_tap: Tapping "${best.text}" at (${x}, ${y})`);
+  await tap(deviceId, x, y);
+
+  return {
+    success: true,
+    message: `Found and tapped "${best.text}" at (${x}, ${y})`,
+    data: best.text,
+  };
+}
+
+// ─── Skill: submit_message ──────────────────────────────────────
+
+const SEND_BUTTON_PATTERN = /send|submit|post|arrow|paper.?plane/i;
+
+async function submitMessage(
+  deviceId: string,
+  elements: UIElement[]
+): Promise<SkillResult> {
+  // 1. Search for Send/Submit button by text or ID
+  let candidates = elements.filter(
+    (el) =>
+      el.enabled &&
+      (el.clickable || el.action === "tap") &&
+      (SEND_BUTTON_PATTERN.test(el.text) || SEND_BUTTON_PATTERN.test(el.id))
+  );
+
+  // 2. Fallback: clickable elements whose y is within 20% of the lowest
+  //    clickable element, preferring the rightmost
+  if (candidates.length === 0) {
+    const clickable = elements
+      .filter((el) => el.enabled && el.clickable)
+      .sort((a, b) => b.center[1] - a.center[1]);
+
+    if (clickable.length > 0) {
+      const maxY = clickable[0].center[1];
+      const threshold = maxY * 0.8;
+      candidates = clickable.filter((el) => el.center[1] >= threshold);
+      candidates.sort((a, b) => b.center[0] - a.center[0]);
+    }
+  }
+
+  if (candidates.length === 0) {
+    return {
+      success: false,
+      message: "Could not find a Send/Submit button on screen",
+    };
+  }
+
+  // 3. Tap the best match
+  const target = candidates[0];
+  const [x, y] = target.center;
+  console.log(
+    `[Skill] submit_message: Tapping "${target.text}" at (${x}, ${y})`
+  );
+  await tap(deviceId, x, y);
+
+  // 4. Wait for response
+  await sleep(4000);
+
+  // 5. Check for new content
+  const { elements: newElements } = await getScreen(deviceId);
+  const originalTexts = new Set(
+    elements.map((el) => el.text).filter(Boolean)
+  );
+  const newTexts = newElements
+    .map((el) => el.text)
+    .filter((t) => t && !originalTexts.has(t));
+
+  if (newTexts.length > 0) {
+    const summary = newTexts.slice(0, 3).join("; ");
+    return {
+      success: true,
+      message: `Tapped "${target.text}" — new content: ${summary}`,
+      data: summary,
+    };
+  }
+
+  return {
+    success: true,
+    message: `Tapped "${target.text}" at (${x}, ${y}). No new content yet — may still be loading.`,
+  };
+}
+
+// ─── Skill: read_screen ─────────────────────────────────────────
+
+async function readScreen(
+  deviceId: string,
+  elements: UIElement[]
+): Promise<SkillResult> {
+  const allTexts: string[] = [];
+  const seenTexts = new Set<string>();
+
+  function collectTexts(els: UIElement[]): number {
+    let added = 0;
+    for (const el of els) {
+      if (el.text && !seenTexts.has(el.text)) {
+        seenTexts.add(el.text);
+        allTexts.push(el.text);
+        added++;
+      }
+    }
+    return added;
+  }
+
+  // 1. Collect from initial screen
+  collectTexts(elements);
+
+  // 2. Scroll down and collect until no new content
+  const maxScrolls = 5;
+  let scrollsDone = 0;
+
+  for (let i = 0; i < maxScrolls; i++) {
+    await swipeDown(deviceId);
+    await sleep(1200);
+    scrollsDone++;
+
+    const { elements: newElements } = await getScreen(deviceId);
+    const added = collectTexts(newElements);
+    console.log(
+      `[Skill] read_screen: Scroll ${scrollsDone} — found ${added} new text elements`
+    );
+
+    if (added === 0) break;
+  }
+
+  const combinedText = allTexts.join("\n");
+
+  // 3. Copy to clipboard
+  if (combinedText.length > 0) {
+    await clipboardSet(deviceId, combinedText);
+  }
+
+  return {
+    success: true,
+    message: `Read ${allTexts.length} text elements across ${scrollsDone} scrolls (${combinedText.length} chars), copied to clipboard`,
+    data: combinedText.slice(0, 300),
+  };
+}
+
+// ─── Skill: wait_for_content ────────────────────────────────────
+
+async function waitForContent(
+  deviceId: string,
+  elements: UIElement[]
+): Promise<SkillResult> {
+  const originalTexts = new Set(
+    elements.map((el) => el.text).filter(Boolean)
+  );
+
+  // Poll up to 5 times (3s intervals = 15s max)
+  for (let i = 0; i < 5; i++) {
+    console.log(
+      `[Skill] wait_for_content: Waiting 3s... (attempt ${i + 1}/5)`
+    );
+    await sleep(3000);
+
+    const { elements: newElements } = await getScreen(deviceId);
+    const newTexts = newElements
+      .map((el) => el.text)
+      .filter((t) => t && !originalTexts.has(t));
+
+    const totalNewChars = newTexts.reduce((sum, t) => sum + t.length, 0);
+    if (totalNewChars > 20) {
+      const summary = newTexts.slice(0, 5).join("; ");
+      return {
+        success: true,
+        message: `New content appeared after ${(i + 1) * 3}s: ${summary}`,
+        data: summary,
+      };
+    }
+  }
+
+  return {
+    success: false,
+    message: "No new content appeared after 15s",
+  };
+}
+
+// ─── Skill: compose_email ───────────────────────────────────────
+
+async function composeEmail(
+  deviceId: string,
+  action: SkillAction
+): Promise<SkillResult> {
+  const emailAddress = action.query;
+  const bodyContent = action.text;
+
+  if (!emailAddress) {
+    return {
+      success: false,
+      message:
+        'compose_email requires query (email address). Example: {"action": "compose_email", "query": "user@example.com"}',
+    };
+  }
+
+  // 1. Launch mailto: intent
+  console.log(`[Skill] compose_email: Launching mailto:${emailAddress}`);
+  await sessions.sendCommand(deviceId, {
+    type: "intent",
+    intentAction: "android.intent.action.SENDTO",
+    intentUri: `mailto:${emailAddress}`,
+  });
+  await sleep(2500);
+
+  // 2. Find body field and paste content
+  const { elements } = await getScreen(deviceId);
+  const editables = elements
+    .filter((el) => el.editable && el.enabled)
+    .sort((a, b) => a.center[1] - b.center[1]);
+
+  if (editables.length === 0) {
+    return {
+      success: false,
+      message: "Launched email compose but no editable fields appeared",
+    };
+  }
+
+  // Body is typically the last/largest editable field
+  const bodyField = editables[editables.length - 1];
+  const [bx, by] = bodyField.center;
+  console.log(`[Skill] compose_email: Tapping Body field at (${bx}, ${by})`);
+  await tap(deviceId, bx, by);
+  await sleep(300);
+
+  // Set clipboard with body content and paste
+  if (bodyContent) {
+    await clipboardSet(deviceId, bodyContent);
+    await sleep(200);
+  }
+  await sessions.sendCommand(deviceId, { type: "paste" });
+
+  return {
+    success: true,
+    message: `Email compose opened to ${emailAddress}, body pasted`,
+  };
+}