diff --git a/examples/logistics-workflow.json b/examples/logistics-workflow.json new file mode 100644 index 0000000..c270f1e --- /dev/null +++ b/examples/logistics-workflow.json @@ -0,0 +1,21 @@ +{ + "name": "Logistics Agent v2.1", + "steps": [ + { + "app": "com.whatsapp", + "goal": "Find the latest Bill of Lading image and save it" + }, + { + "app": "com.intsig.camscanner", + "goal": "Crop and enhance the latest image" + }, + { + "app": "com.rtspro.factoring", + "goal": "Fill out the invoice submission form", + "formData": { + "Invoice": "#9921", + "Amount": "$4,200.00" + } + } + ] +} diff --git a/src/actions.ts b/src/actions.ts index d565441..478142f 100644 --- a/src/actions.ts +++ b/src/actions.ts @@ -2,10 +2,11 @@ * Action execution module for DroidClaw. * Handles all ADB commands for interacting with Android devices. * - * Supported actions (21): + * Supported actions (28): * tap, type, enter, swipe, home, back, wait, done, * longpress, screenshot, launch, clear, clipboard_get, clipboard_set, paste, shell, - * submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email + * submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email, + * open_url, switch_app, notifications, pull_file, push_file, keyevent, open_settings */ import { Config } from "./config.js"; @@ -47,6 +48,17 @@ export interface ActionDecision { // multi-step action fields (Phase 6) skill?: string; // legacy: kept for backward compat, prefer action field directly query?: string; // email address for compose_email, search term for find_and_tap/copy_visible_text + // open_url action + url?: string; + // pull_file action + path?: string; + // push_file action + source?: string; + dest?: string; + // keyevent action + code?: number; + // open_settings action + setting?: string; } export interface ActionResult { @@ -186,6 +198,20 @@ export function executeAction(action: ActionDecision): ActionResult { return executeShell(action); case "scroll": return 
executeScroll(action); + case "open_url": + return executeOpenUrl(action); + case "switch_app": + return executeSwitchApp(action); + case "notifications": + return executeNotifications(); + case "pull_file": + return executePullFile(action); + case "push_file": + return executePushFile(action); + case "keyevent": + return executeKeyevent(action); + case "open_settings": + return executeOpenSettings(action); default: console.log(`Warning: Unknown action: ${action.action}`); return { success: false, message: `Unknown action: ${action.action}` }; @@ -301,17 +327,26 @@ function executeType(action: ActionDecision): ActionResult { } } - // ADB requires %s for spaces, escape special shell characters + // ADB requires %s for spaces, escape special shell characters. + // Backslash must be escaped first to avoid double-escaping. const escapedText = text .replaceAll("\\", "\\\\") .replaceAll("\"", "\\\"") .replaceAll("'", "\\'") + .replaceAll("`", "\\`") + .replaceAll("$", "\\$") + .replaceAll("!", "\\!") + .replaceAll("?", "\\?") .replaceAll(" ", "%s") .replaceAll("&", "\\&") .replaceAll("|", "\\|") .replaceAll(";", "\\;") .replaceAll("(", "\\(") .replaceAll(")", "\\)") + .replaceAll("[", "\\[") + .replaceAll("]", "\\]") + .replaceAll("{", "\\{") + .replaceAll("}", "\\}") .replaceAll("<", "\\<") .replaceAll(">", "\\>"); console.log(`Typing: ${text}`); @@ -483,7 +518,10 @@ function executeClipboardSet(action: ActionDecision): ActionResult { const text = action.text ?? 
""; if (!text) return { success: false, message: "No text to set on clipboard" }; console.log(`Setting clipboard: ${text.slice(0, 50)}...`); - runAdbCommand(["shell", "cmd", "clipboard", "set-text", text]); + // Safe shell escaping: wrap in single quotes, escape internal ' as '\'' + // This matches safeClipboardSet() in skills.ts + const escaped = text.replaceAll("'", "'\\''"); + runAdbCommand(["shell", `cmd clipboard set-text '${escaped}'`]); return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` }; } @@ -533,6 +571,128 @@ function executeScroll(action: ActionDecision): ActionResult { return { success: true, message: `Scrolled ${direction}` }; } +// =========================================== +// Phase 7: New actions +// =========================================== + +/** + * Opens a URL in the default browser. + */ +function executeOpenUrl(action: ActionDecision): ActionResult { + const url = action.url ?? ""; + if (!url) return { success: false, message: "No URL provided" }; + console.log(`Opening URL: ${url}`); + const result = runAdbCommand(["shell", "am", "start", "-a", "android.intent.action.VIEW", "-d", url]); + return { success: true, message: `Opened URL: ${url}`, data: result }; +} + +/** + * Switches to a specific app by package name. + */ +function executeSwitchApp(action: ActionDecision): ActionResult { + const pkg = action.package ?? ""; + if (!pkg) return { success: false, message: "No package name provided" }; + console.log(`Switching to app: ${pkg}`); + const result = runAdbCommand([ + "shell", "monkey", "-p", pkg, "-c", "android.intent.category.LAUNCHER", "1", + ]); + return { success: true, message: `Switched to ${pkg}`, data: result }; +} + +/** + * Reads notification bar content. Parses title/text from active notifications. 
/** Matches one `android.title=` / `android.text=` extra on a dumpsys line. */
const NOTIFICATION_FIELD_PATTERN = /android\.(title|text)=(.*)$/;

/** Matches the `String (...)` / `SpannableString (...)` wrapper dumpsys puts around values. */
const DUMPSYS_STRING_WRAPPER = /^\w*String\s*\((.*)\)$/;

/**
 * Strips the optional `String (...)` / `SpannableString (...)` wrapper from a
 * dumpsys value. The closing parenthesis is anchored to the END of the value,
 * so parentheses inside the notification text are kept intact.
 */
function unwrapDumpsysValue(raw: string): string {
  const value = raw.trim();
  const wrapped = DUMPSYS_STRING_WRAPPER.exec(value);
  return (wrapped ? wrapped[1] : value).trim();
}

/**
 * Extracts "title: text" pairs from `dumpsys notification` output.
 *
 * A title is remembered until the next `android.text=` line, then emitted and
 * cleared; a text line with no pending title is ignored.
 *
 * @param raw Full dumpsys output (LF or CRLF line endings).
 * @returns One "title: text" string per paired notification, in dump order.
 */
function parseNotificationDump(raw: string): string[] {
  const notifications: string[] = [];
  let pendingTitle = "";
  for (const line of raw.split(/\r?\n/)) {
    const field = NOTIFICATION_FIELD_PATTERN.exec(line);
    if (!field) continue;
    const value = unwrapDumpsysValue(field[2]);
    if (field[1] === "title") {
      pendingTitle = value;
    } else if (pendingTitle) {
      notifications.push(`${pendingTitle}: ${value}`);
      pendingTitle = "";
    }
  }
  return notifications;
}

/**
 * Reads notification bar content. Parses title/text from active notifications.
 *
 * @returns Always success; `data` holds one "title: text" line per
 *          notification, or "No notifications found".
 */
function executeNotifications(): ActionResult {
  console.log("Reading notifications");
  const raw = runAdbCommand(["shell", "dumpsys", "notification", "--noredact"]);
  const notifications = parseNotificationDump(raw);
  const summary = notifications.length > 0
    ? notifications.join("\n")
    : "No notifications found";
  console.log(`Found ${notifications.length} notifications`);
  return { success: true, message: `Notifications:\n${summary}`, data: summary };
}

/** Local directory that receives files copied off the device. */
const PULLED_FILES_DIR = "./pulled_files";

/**
 * Derives the local file name for a pulled device path: the last non-empty
 * path segment, or "file" when there is none or when the segment is "." / ".."
 * (which would otherwise make the local target resolve outside pulled_files).
 */
function localNameForDevicePath(devicePath: string): string {
  const segments = devicePath.split("/").filter((segment) => segment.length > 0);
  const last = segments[segments.length - 1] ?? "";
  return last === "" || last === "." || last === ".." ? "file" : last;
}

/**
 * Pulls a file from device to local machine (into ./pulled_files).
 *
 * @param action Decision whose `path` field is the device-side path.
 * @returns Failure when `path` is missing; otherwise success, with the raw
 *          `adb pull` output in `data`.
 */
function executePullFile(action: ActionDecision): ActionResult {
  const devicePath = action.path ?? "";
  if (!devicePath) return { success: false, message: "No device path provided" };

  // Local require kept as in the original block; `recursive: true` makes the
  // call a no-op when the directory already exists, so no existsSync probe.
  const { mkdirSync } = require("node:fs");
  mkdirSync(PULLED_FILES_DIR, { recursive: true });

  const localPath = `${PULLED_FILES_DIR}/${localNameForDevicePath(devicePath)}`;
  console.log(`Pulling file: ${devicePath} → ${localPath}`);
  const result = runAdbCommand(["pull", devicePath, localPath]);
  return { success: true, message: `Pulled ${devicePath} → ${localPath}`, data: result };
}

/**
 * Pushes a file from local machine to device.
 *
 * @param action Decision with `source` (local path) and `dest` (device path).
 * @returns Failure when either path is missing; otherwise success, with the
 *          raw `adb push` output in `data`.
 */
function executePushFile(action: ActionDecision): ActionResult {
  const source = action.source ?? "";
  const dest = action.dest ?? "";
  if (!source || !dest) return { success: false, message: "Missing source or dest path" };

  console.log(`Pushing file: ${source} → ${dest}`);
  const result = runAdbCommand(["push", source, dest]);
  return { success: true, message: `Pushed ${source} → ${dest}`, data: result };
}

/** Accepted keyevent forms: a non-negative integer or a symbolic KEYCODE_* name. */
const KEYCODE_PATTERN = /^(?:\d+|KEYCODE_[A-Z0-9_]+)$/;

/**
 * Sends any Android keycode. Escape hatch for keys not covered by other actions.
 *
 * The code is validated before it reaches `adb shell input keyevent`, because
 * the decision may come from unvalidated JSON: fractions, negatives, NaN and
 * arbitrary strings are rejected instead of being passed to the device shell.
 *
 * @param action Decision whose `code` field is the keycode.
 * @returns Failure when the code is missing or malformed; otherwise success.
 */
function executeKeyevent(action: ActionDecision): ActionResult {
  const code = action.code;
  if (code == null) return { success: false, message: "No keycode provided" };

  const keycode = String(code).trim();
  if (!KEYCODE_PATTERN.test(keycode)) {
    return { success: false, message: `Invalid keycode: ${keycode}` };
  }

  console.log(`Sending keyevent: ${keycode}`);
  runAdbCommand(["shell", "input", "keyevent", keycode]);
  return { success: true, message: `Sent keyevent ${keycode}` };
}

/**
 * Maps the short setting names accepted by `open_settings` to the Android
 * intent action that opens the matching settings screen.
 */
const SETTINGS_MAP: Record<string, string> = {
  wifi: "android.settings.WIFI_SETTINGS",
  bluetooth: "android.settings.BLUETOOTH_SETTINGS",
  display: "android.settings.DISPLAY_SETTINGS",
  sound: "android.settings.SOUND_SETTINGS",
  battery: "android.settings.BATTERY_SAVER_SETTINGS",
  location: "android.settings.LOCATION_SOURCE_SETTINGS",
  apps: "android.settings.APPLICATION_SETTINGS",
  date: "android.settings.DATE_SETTINGS",
  accessibility: "android.settings.ACCESSIBILITY_SETTINGS",
  developer: "android.settings.APPLICATION_DEVELOPMENT_SETTINGS",
};

/**
 * Opens specific Android settings screens.
 *
 * @param action Decision whose `setting` field is one of the SETTINGS_MAP keys
 *               (matched case-insensitively, surrounding whitespace ignored).
 * @returns Failure listing the valid names when the setting is unknown;
 *          otherwise success, with the raw `am start` output in `data`.
 */
function executeOpenSettings(action: ActionDecision): ActionResult {
  const setting = action.setting ?? "";
  const key = setting.trim().toLowerCase();
  // Own-property check: a plain index lookup would also resolve inherited keys
  // such as "constructor" and hand a non-string to adb.
  const intentAction = Object.prototype.hasOwnProperty.call(SETTINGS_MAP, key)
    ? SETTINGS_MAP[key]
    : undefined;
  if (!intentAction) {
    const valid = Object.keys(SETTINGS_MAP).join(", ");
    return { success: false, message: `Unknown setting "${setting}". Valid: ${valid}` };
  }

  console.log(`Opening settings: ${key}`);
  const result = runAdbCommand(["shell", "am", "start", "-a", intentAction]);
  return { success: true, message: `Opened ${key} settings`, data: result };
}
*/ diff --git a/src/kernel.ts b/src/kernel.ts index 9bcceac..53ea033 100644 --- a/src/kernel.ts +++ b/src/kernel.ts @@ -40,6 +40,7 @@ import { executeSkill } from "./skills.js"; import { getLlmProvider, trimMessages, + parseJsonResponse, SYSTEM_PROMPT, type LLMProvider, type ChatMessage, @@ -170,54 +171,11 @@ async function getDecisionStreaming( return parseJsonResponse(accumulated); } -/** - * Sanitizes raw LLM text so it can be parsed as JSON. - * LLMs often put literal newlines inside JSON string values which breaks JSON.parse(). - * This replaces unescaped newlines inside strings with spaces. - */ -function sanitizeJsonText(raw: string): string { - // Replace literal newlines/carriage returns with spaces — valid JSON - // doesn't require newlines, and LLMs often embed them in string values. - return raw.replace(/\n/g, " ").replace(/\r/g, " "); -} - -/** JSON parser with newline sanitization and markdown fallback (for streaming path) */ -function parseJsonResponse(text: string): ActionDecision { - let decision: ActionDecision | null = null; - - // First try raw text - try { - decision = JSON.parse(text); - } catch { - // Try after sanitizing newlines - try { - decision = JSON.parse(sanitizeJsonText(text)); - } catch { - // Try extracting JSON block from markdown or surrounding text - const match = text.match(/\{[\s\S]*\}/); - if (match) { - try { - decision = JSON.parse(sanitizeJsonText(match[0])); - } catch { - // fall through - } - } - } - } - - if (!decision) { - console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`); - return { action: "wait", reason: "Failed to parse response, waiting" }; - } - decision.coordinates = sanitizeCoordinates(decision.coordinates); - return decision; -} - // =========================================== // Main Agent Loop // =========================================== -async function runAgent(goal: string, maxSteps?: number): Promise { +export async function runAgent(goal: string, maxSteps?: number): 
Promise<{ success: boolean; stepsUsed: number }> { const steps = maxSteps ?? Config.MAX_STEPS; // Phase 1A: Auto-detect screen resolution @@ -485,7 +443,7 @@ async function runAgent(goal: string, maxSteps?: number): Promise { if (decision.action === "done") { console.log("\nTask completed successfully."); logger.finalize(true); - return; + return { success: true, stepsUsed: step + 1 }; } // Wait for UI to update @@ -494,6 +452,7 @@ async function runAgent(goal: string, maxSteps?: number): Promise { console.log("\nMax steps reached. Task may be incomplete."); logger.finalize(false); + return { success: false, stepsUsed: steps }; } // =========================================== @@ -508,7 +467,33 @@ async function main(): Promise { return; } - // Read user input from stdin + // Check for --workflow flag + const workflowIdx = process.argv.findIndex((a) => a === "--workflow" || a.startsWith("--workflow=")); + if (workflowIdx !== -1) { + const arg = process.argv[workflowIdx]; + const workflowFile = arg.includes("=") + ? arg.split("=")[1] + : process.argv[workflowIdx + 1]; + + if (!workflowFile) { + console.log("Error: --workflow requires a JSON file path."); + process.exit(1); + } + + const { runWorkflow } = await import("./workflow.js"); + const workflow = JSON.parse(await Bun.file(workflowFile).text()); + const result = await runWorkflow(workflow); + + console.log(`\n=== Workflow "${result.name}" ===`); + for (const step of result.steps) { + const status = step.success ? "OK" : "FAILED"; + console.log(` [${status}] ${step.goal} (${step.stepsUsed} steps)${step.error ? ` — ${step.error}` : ""}`); + } + console.log(`\nResult: ${result.success ? "All steps completed" : "Some steps failed"}`); + process.exit(result.success ? 
0 : 1); + } + + // Interactive mode: read goal from stdin process.stdout.write("Enter your goal: "); const goal = await new Promise((resolve) => { const reader = Bun.stdin.stream().getReader(); diff --git a/src/llm-providers.ts b/src/llm-providers.ts index eb4e6f3..09df13e 100644 --- a/src/llm-providers.ts +++ b/src/llm-providers.ts @@ -26,7 +26,7 @@ import { import { sanitizeCoordinates, type ActionDecision } from "./actions.js"; // =========================================== -// System Prompt — all 15 actions + planning +// System Prompt — all 22 actions + planning // =========================================== export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI. @@ -58,7 +58,7 @@ Example: {"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"} ═══════════════════════════════════════════ -AVAILABLE ACTIONS (15 total) +AVAILABLE ACTIONS (22 total) ═══════════════════════════════════════════ Navigation (coordinates MUST be a JSON array of TWO separate integers [x, y] — never concatenate them): @@ -77,12 +77,21 @@ App Control: {"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"} {"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"} {"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"} + {"action": "open_url", "url": "https://example.com", "reason": "Open URL in browser"} + {"action": "switch_app", "package": "com.whatsapp", "reason": "Switch to WhatsApp"} + {"action": "open_settings", "setting": 
"wifi|bluetooth|display|sound|battery|location|apps|date|accessibility|developer", "reason": "Open settings screen"} Data: {"action": "clipboard_get", "reason": "Read clipboard contents"} {"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"} {"action": "paste", "coordinates": [540, 804], "reason": "Paste clipboard into focused field"} +Device & Files: + {"action": "notifications", "reason": "Read notification bar content"} + {"action": "pull_file", "path": "/sdcard/Download/file.pdf", "reason": "Pull file from device"} + {"action": "push_file", "source": "./file.pdf", "dest": "/sdcard/Download/file.pdf", "reason": "Push file to device"} + {"action": "keyevent", "code": 187, "reason": "Send keycode (187=recent apps, 26=power, etc.)"} + System: {"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"} {"action": "wait", "reason": "Wait for screen to load"} @@ -127,7 +136,7 @@ CRITICAL RULES 7. READ PAGES: Use "read_screen" to collect all text from a page (search results, articles, feeds). It scrolls automatically and copies everything to clipboard. 8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc). 9. SCROLLING: If the item you need isn't visible, use "scroll" with direction "down" to see more below, or "up" for above. -10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return. +10. MULTI-APP: Use "switch_app" with the package name to switch directly between apps. Or use "home" then "launch". Use "back" to return within the same app. 11. PASSWORDS: Never log or output the text of password fields. 12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success. 13. SUBMIT IN CHAT APPS: Use "submit_message" action instead of "enter" in chat apps. It finds and taps the Send button, waits for a response, and reports new content. Only use "enter" in search bars or web forms. 
@@ -331,7 +340,7 @@ const actionDecisionSchema = z.object({ think: z.string().optional().describe("Your reasoning about the current screen state and what to do next"), plan: z.array(z.string()).optional().describe("3-5 high-level steps to achieve the goal"), planProgress: z.string().optional().describe("Which plan step you are currently on"), - action: z.string().describe("The action to take: tap, type, scroll, enter, back, home, wait, done, longpress, launch, clear, clipboard_get, clipboard_set, paste, shell, read_screen, submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email"), + action: z.string().describe("The action to take: tap, type, scroll, enter, back, home, wait, done, longpress, launch, clear, clipboard_get, clipboard_set, paste, shell, open_url, switch_app, notifications, pull_file, push_file, keyevent, open_settings, read_screen, submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email"), coordinates: z.tuple([z.number(), z.number()]).optional().describe("Target field as [x, y] — used by tap, longpress, type, and paste"), text: z.string().optional().describe("Text to type, clipboard text, or email body for compose_email"), direction: z.string().optional().describe("Scroll direction: up, down, left, right"), @@ -343,6 +352,12 @@ const actionDecisionSchema = z.object({ command: z.string().optional().describe("Shell command to run"), filename: z.string().optional().describe("Screenshot filename"), query: z.string().optional().describe("Email address for compose_email (REQUIRED), search term for find_and_tap (REQUIRED), or filter for copy_visible_text"), + url: z.string().optional().describe("URL to open for open_url action"), + path: z.string().optional().describe("Device file path for pull_file action"), + source: z.string().optional().describe("Local file path for push_file action"), + dest: z.string().optional().describe("Device destination path for push_file action"), + code: 
z.number().optional().describe("Android keycode number for keyevent action"), + setting: z.string().optional().describe("Setting name for open_settings: wifi, bluetooth, display, sound, battery, location, apps, date, accessibility, developer"), }); class OpenRouterProvider implements LLMProvider { @@ -589,11 +604,11 @@ class BedrockProvider implements LLMProvider { * Sanitizes raw LLM text so it can be parsed as JSON. * LLMs often put literal newlines inside JSON string values which breaks JSON.parse(). */ -function sanitizeJsonText(raw: string): string { +export function sanitizeJsonText(raw: string): string { return raw.replace(/\n/g, " ").replace(/\r/g, " "); } -function parseJsonResponse(text: string): ActionDecision { +export function parseJsonResponse(text: string): ActionDecision { let decision: ActionDecision | null = null; try { decision = JSON.parse(text); diff --git a/src/workflow.ts b/src/workflow.ts new file mode 100644 index 0000000..144737c --- /dev/null +++ b/src/workflow.ts @@ -0,0 +1,134 @@ +/** + * Workflow orchestration engine for DroidClaw. + * + * Executes a sequence of sub-goals, each optionally scoped to a specific app. + * This is DroidClaw's equivalent of `analyze_and_act(sub_goal, app)`. 
/**
 * Usage:
 *   bun run src/kernel.ts --workflow examples/logistics-workflow.json
 */

import { runAgent } from "./kernel.js";
import { runAdbCommand } from "./actions.js";

// ===========================================
// Types
// ===========================================

/** One sub-goal of a workflow. */
export interface WorkflowStep {
  /** Natural-language goal handed to the agent for this step. */
  goal: string;
  /** Package name of the app to bring to the foreground before the step runs. */
  app?: string;
  /** Step budget for the agent; DEFAULT_STEP_LIMIT when omitted. */
  maxSteps?: number;
  /** Field-label → value pairs appended to the goal as form-filling instructions. */
  formData?: Record<string, string>;
}

/** A named, ordered list of steps. */
export interface Workflow {
  name: string;
  steps: WorkflowStep[];
}

/** Outcome of a single workflow step. */
export interface StepResult {
  goal: string;
  app?: string;
  success: boolean;
  /** Agent steps consumed; 0 when the step threw before the agent reported. */
  stepsUsed: number;
  /** Message of the error that aborted the step, if any. */
  error?: string;
}

/** Outcome of a whole workflow run. */
export interface WorkflowResult {
  name: string;
  steps: StepResult[];
  /** True only when every step succeeded. */
  success: boolean;
}

// ===========================================
// Workflow Engine
// ===========================================

const DEFAULT_STEP_LIMIT = 15;
const APP_LAUNCH_DELAY_MS = 2000;

/**
 * Characters accepted in a workflow step's `app` value: dot-separated segments
 * of letters, digits and underscores.
 */
const WORKFLOW_PACKAGE_PATTERN = /^[A-Za-z0-9_]+(?:\.[A-Za-z0-9_]+)*$/;

/**
 * Builds the effective goal string for a workflow step.
 * Appends structured form data if present.
 *
 * @param step Step whose `goal` (and optional `formData`) is rendered.
 * @returns The goal alone, or the goal followed by a "FORM DATA TO FILL" list
 *          with one "- key: value" line per form entry.
 */
function buildGoal(step: WorkflowStep): string {
  const entries = Object.entries(step.formData ?? {});
  if (entries.length === 0) {
    return step.goal;
  }

  const lines = entries.map(([key, value]) => `- ${key}: ${value}`).join("\n");
  return `${step.goal}\n\nFORM DATA TO FILL:\n${lines}\n\nFind each field on screen and enter the corresponding value.`;
}

/**
 * Switches to the specified app by launching it via monkey.
 *
 * @param packageName Package to launch.
 * @throws Error when the name contains characters outside letters, digits,
 *         underscores and dots — arguments after `adb shell` are re-parsed by
 *         the device shell, so such a value could run extra commands.
 */
function switchToApp(packageName: string): void {
  if (!WORKFLOW_PACKAGE_PATTERN.test(packageName)) {
    throw new Error(`Invalid app package name: "${packageName}"`);
  }
  console.log(`Switching to app: ${packageName}`);
  runAdbCommand([
    "shell", "monkey", "-p", packageName,
    "-c", "android.intent.category.LAUNCHER", "1",
  ]);
}
/**
 * Executes a full workflow: a sequence of sub-goals with optional app switching.
 *
 * Steps run strictly in order. A failed step is recorded and the run continues
 * with the next step; the overall result is successful only when every step
 * succeeded (an empty step list counts as success).
 *
 * @param workflow Workflow definition, typically parsed from a JSON file.
 * @returns The workflow name, one StepResult per step, and the overall flag.
 * @throws TypeError when `workflow.steps` is not an array (e.g. a malformed
 *         workflow file).
 */
export async function runWorkflow(workflow: Workflow): Promise<WorkflowResult> {
  // The caller passes JSON.parse output straight through, so check the shape
  // here instead of failing later on `undefined.length`.
  if (workflow == null || !Array.isArray(workflow.steps)) {
    throw new TypeError('Invalid workflow: "steps" must be an array of workflow steps');
  }

  const total = workflow.steps.length;
  console.log(`\n========================================`);
  console.log(`Workflow: ${workflow.name}`);
  console.log(`Steps: ${total}`);
  console.log(`========================================`);

  const results: StepResult[] = [];

  for (const [index, step] of workflow.steps.entries()) {
    const stepNumber = index + 1;
    console.log(`\n--- Step ${stepNumber}/${total}: ${step?.goal} ---`);

    let result: StepResult;
    try {
      if (typeof step?.goal !== "string" || step.goal.trim() === "") {
        throw new Error('Workflow step is missing a non-empty "goal" string');
      }

      // Switch to target app if specified. Done inside the try so a failed
      // launch is reported as this step's failure rather than aborting the run.
      if (step.app) {
        switchToApp(step.app);
        await Bun.sleep(APP_LAUNCH_DELAY_MS);
      }

      // Build effective goal with form data, then execute the sub-goal.
      const effectiveGoal = buildGoal(step);
      const maxSteps = step.maxSteps ?? DEFAULT_STEP_LIMIT;
      const agentResult = await runAgent(effectiveGoal, maxSteps);
      result = {
        goal: step.goal,
        app: step.app,
        success: agentResult.success,
        stepsUsed: agentResult.stepsUsed,
      };
    } catch (err: unknown) {
      result = {
        goal: String(step?.goal ?? ""),
        app: step?.app,
        success: false,
        stepsUsed: 0,
        error: err instanceof Error ? err.message : String(err),
      };
    }

    results.push(result);

    const status = result.success ? "completed" : "failed";
    console.log(`\nStep ${stepNumber} ${status} (${result.stepsUsed} steps used)`);
  }

  const allSuccess = results.every((r) => r.success);
  return { name: workflow.name, steps: results, success: allSuccess };
}