Add 7 new actions, workflow orchestration, and shell escaping fixes
- New actions: open_url, switch_app, notifications, pull_file, push_file, keyevent, open_settings
- Workflow system: runWorkflow() for multi-app sub-goal sequences with --workflow CLI flag
- Export runAgent() with {success, stepsUsed} return for workflow integration
- Fix clipboard_set shell escaping (single-quote wrapping matching skills.ts)
- Improve type action escaping for backticks, $, !, ?, brackets, braces
- Move parseJsonResponse to llm-providers.ts and export it
- Update SYSTEM_PROMPT and Zod schema for 22 total actions
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
21
examples/logistics-workflow.json
Normal file
21
examples/logistics-workflow.json
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"name": "Logistics Agent v2.1",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"app": "com.whatsapp",
|
||||||
|
"goal": "Find the latest Bill of Lading image and save it"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"app": "com.intsig.camscanner",
|
||||||
|
"goal": "Crop and enhance the latest image"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"app": "com.rtspro.factoring",
|
||||||
|
"goal": "Fill out the invoice submission form",
|
||||||
|
"formData": {
|
||||||
|
"Invoice": "#9921",
|
||||||
|
"Amount": "$4,200.00"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
168
src/actions.ts
168
src/actions.ts
@@ -2,10 +2,11 @@
|
|||||||
* Action execution module for DroidClaw.
|
* Action execution module for DroidClaw.
|
||||||
* Handles all ADB commands for interacting with Android devices.
|
* Handles all ADB commands for interacting with Android devices.
|
||||||
*
|
*
|
||||||
* Supported actions (21):
|
* Supported actions (28):
|
||||||
* tap, type, enter, swipe, home, back, wait, done,
|
* tap, type, enter, swipe, home, back, wait, done,
|
||||||
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, paste, shell,
|
* longpress, screenshot, launch, clear, clipboard_get, clipboard_set, paste, shell,
|
||||||
* submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email
|
* submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email,
|
||||||
|
* open_url, switch_app, notifications, pull_file, push_file, keyevent, open_settings
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { Config } from "./config.js";
|
import { Config } from "./config.js";
|
||||||
@@ -47,6 +48,17 @@ export interface ActionDecision {
|
|||||||
// multi-step action fields (Phase 6)
|
// multi-step action fields (Phase 6)
|
||||||
skill?: string; // legacy: kept for backward compat, prefer action field directly
|
skill?: string; // legacy: kept for backward compat, prefer action field directly
|
||||||
query?: string; // email address for compose_email, search term for find_and_tap/copy_visible_text
|
query?: string; // email address for compose_email, search term for find_and_tap/copy_visible_text
|
||||||
|
// open_url action
|
||||||
|
url?: string;
|
||||||
|
// pull_file action
|
||||||
|
path?: string;
|
||||||
|
// push_file action
|
||||||
|
source?: string;
|
||||||
|
dest?: string;
|
||||||
|
// keyevent action
|
||||||
|
code?: number;
|
||||||
|
// open_settings action
|
||||||
|
setting?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ActionResult {
|
export interface ActionResult {
|
||||||
@@ -186,6 +198,20 @@ export function executeAction(action: ActionDecision): ActionResult {
|
|||||||
return executeShell(action);
|
return executeShell(action);
|
||||||
case "scroll":
|
case "scroll":
|
||||||
return executeScroll(action);
|
return executeScroll(action);
|
||||||
|
case "open_url":
|
||||||
|
return executeOpenUrl(action);
|
||||||
|
case "switch_app":
|
||||||
|
return executeSwitchApp(action);
|
||||||
|
case "notifications":
|
||||||
|
return executeNotifications();
|
||||||
|
case "pull_file":
|
||||||
|
return executePullFile(action);
|
||||||
|
case "push_file":
|
||||||
|
return executePushFile(action);
|
||||||
|
case "keyevent":
|
||||||
|
return executeKeyevent(action);
|
||||||
|
case "open_settings":
|
||||||
|
return executeOpenSettings(action);
|
||||||
default:
|
default:
|
||||||
console.log(`Warning: Unknown action: ${action.action}`);
|
console.log(`Warning: Unknown action: ${action.action}`);
|
||||||
return { success: false, message: `Unknown action: ${action.action}` };
|
return { success: false, message: `Unknown action: ${action.action}` };
|
||||||
@@ -301,17 +327,26 @@ function executeType(action: ActionDecision): ActionResult {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ADB requires %s for spaces, escape special shell characters
|
// ADB requires %s for spaces, escape special shell characters.
|
||||||
|
// Backslash must be escaped first to avoid double-escaping.
|
||||||
const escapedText = text
|
const escapedText = text
|
||||||
.replaceAll("\\", "\\\\")
|
.replaceAll("\\", "\\\\")
|
||||||
.replaceAll("\"", "\\\"")
|
.replaceAll("\"", "\\\"")
|
||||||
.replaceAll("'", "\\'")
|
.replaceAll("'", "\\'")
|
||||||
|
.replaceAll("`", "\\`")
|
||||||
|
.replaceAll("$", "\\$")
|
||||||
|
.replaceAll("!", "\\!")
|
||||||
|
.replaceAll("?", "\\?")
|
||||||
.replaceAll(" ", "%s")
|
.replaceAll(" ", "%s")
|
||||||
.replaceAll("&", "\\&")
|
.replaceAll("&", "\\&")
|
||||||
.replaceAll("|", "\\|")
|
.replaceAll("|", "\\|")
|
||||||
.replaceAll(";", "\\;")
|
.replaceAll(";", "\\;")
|
||||||
.replaceAll("(", "\\(")
|
.replaceAll("(", "\\(")
|
||||||
.replaceAll(")", "\\)")
|
.replaceAll(")", "\\)")
|
||||||
|
.replaceAll("[", "\\[")
|
||||||
|
.replaceAll("]", "\\]")
|
||||||
|
.replaceAll("{", "\\{")
|
||||||
|
.replaceAll("}", "\\}")
|
||||||
.replaceAll("<", "\\<")
|
.replaceAll("<", "\\<")
|
||||||
.replaceAll(">", "\\>");
|
.replaceAll(">", "\\>");
|
||||||
console.log(`Typing: ${text}`);
|
console.log(`Typing: ${text}`);
|
||||||
@@ -483,7 +518,10 @@ function executeClipboardSet(action: ActionDecision): ActionResult {
|
|||||||
const text = action.text ?? "";
|
const text = action.text ?? "";
|
||||||
if (!text) return { success: false, message: "No text to set on clipboard" };
|
if (!text) return { success: false, message: "No text to set on clipboard" };
|
||||||
console.log(`Setting clipboard: ${text.slice(0, 50)}...`);
|
console.log(`Setting clipboard: ${text.slice(0, 50)}...`);
|
||||||
runAdbCommand(["shell", "cmd", "clipboard", "set-text", text]);
|
// Safe shell escaping: wrap in single quotes, escape internal ' as '\''
|
||||||
|
// This matches safeClipboardSet() in skills.ts
|
||||||
|
const escaped = text.replaceAll("'", "'\\''");
|
||||||
|
runAdbCommand(["shell", `cmd clipboard set-text '${escaped}'`]);
|
||||||
return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` };
|
return { success: true, message: `Clipboard set to "${text.slice(0, 50)}"` };
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -533,6 +571,128 @@ function executeScroll(action: ActionDecision): ActionResult {
|
|||||||
return { success: true, message: `Scrolled ${direction}` };
|
return { success: true, message: `Scrolled ${direction}` };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Phase 7: New actions
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/**
 * Opens a URL on the device by firing an `android.intent.action.VIEW` intent.
 *
 * `adb shell` joins its arguments into a single command line that the device
 * shell re-parses, so an unquoted URL containing `&`, `;`, `?`, spaces, etc.
 * would be truncated or executed as extra commands. The URL is therefore
 * wrapped in single quotes, using the same scheme as the clipboard_set action.
 *
 * @param action - decision whose `url` field holds the address to open
 * @returns a failure result when `url` is missing or blank; otherwise a
 *          success result carrying the adb output in `data`
 */
function executeOpenUrl(action: ActionDecision): ActionResult {
  const url = (action.url ?? "").trim();
  if (!url) return { success: false, message: "No URL provided" };

  console.log(`Opening URL: ${url}`);

  // Close the quote, emit an escaped quote, reopen: ' -> '\''
  const quotedUrl = `'${url.replace(/'/g, "'\\''")}'`;
  const result = runAdbCommand([
    "shell",
    `am start -a android.intent.action.VIEW -d ${quotedUrl}`,
  ]);

  return { success: true, message: `Opened URL: ${url}`, data: result };
}
|
||||||
|
|
||||||
|
/**
 * Brings an app to the foreground by launching its LAUNCHER activity via `monkey`.
 *
 * The package name ends up on the device shell's command line, so anything
 * outside the characters a package name can contain is rejected before adb is
 * invoked.
 *
 * @param action - decision whose `package` field names the target app
 * @returns a failure result when the package is missing, malformed, or monkey
 *          reports that it has no launchable activity; success otherwise
 */
function executeSwitchApp(action: ActionDecision): ActionResult {
  const pkg = (action.package ?? "").trim();
  if (!pkg) return { success: false, message: "No package name provided" };

  // Letters, digits, underscores and dots only: keeps shell metacharacters out.
  if (!/^[A-Za-z0-9_.]+$/.test(pkg)) {
    return { success: false, message: `Invalid package name "${pkg}"` };
  }

  console.log(`Switching to app: ${pkg}`);
  const result = runAdbCommand([
    "shell", "monkey", "-p", pkg, "-c", "android.intent.category.LAUNCHER", "1",
  ]);

  // monkey prints "No activities found to run, monkey aborted." when the
  // package is not installed or exposes no launcher activity.
  if (typeof result === "string" && /No activities found to run/i.test(result)) {
    return {
      success: false,
      message: `Could not switch to ${pkg}: no launchable activity found`,
      data: result,
    };
  }

  return { success: true, message: `Switched to ${pkg}`, data: result };
}
|
||||||
|
|
||||||
|
/**
 * Reads the active notifications from `dumpsys notification --noredact` and
 * condenses them to one "title: text" line each.
 *
 * Extras are printed one per line, typically as `android.title=String (Title)`
 * or `android.text=SpannableString (Body)`. The whole remainder of the line is
 * captured and the `<Type> ( ... )` wrapper is stripped up to the LAST closing
 * parenthesis, so values that themselves contain parentheses are kept intact.
 * A literal `null` value is treated as "no value".
 *
 * @returns always a success result; `data` holds the newline-joined summary,
 *          or "No notifications found" when nothing could be parsed
 */
function executeNotifications(): ActionResult {
  console.log("Reading notifications");
  const raw = runAdbCommand(["shell", "dumpsys", "notification", "--noredact"]);

  const titlePattern = /android\.title=(.*)$/;
  const textPattern = /android\.text=(.*)$/;
  const typedValuePattern = /^[\w.$]*String\s*\((.*)\)$/;

  // "String (Hello (world))" -> "Hello (world)"; "null" -> ""
  const unwrap = (value: string): string => {
    const trimmed = value.trim();
    const typed = trimmed.match(typedValuePattern);
    const inner = (typed ? typed[1] : trimmed).trim();
    return inner === "null" ? "" : inner;
  };

  const notifications: string[] = [];
  let currentTitle = "";
  for (const line of raw.split(/\r?\n/)) {
    const titleMatch = line.match(titlePattern);
    if (titleMatch) currentTitle = unwrap(titleMatch[1]);

    const textMatch = line.match(textPattern);
    if (textMatch && currentTitle) {
      const body = unwrap(textMatch[1]);
      notifications.push(body ? `${currentTitle}: ${body}` : currentTitle);
      // A title is consumed by the first text that follows it.
      currentTitle = "";
    }
  }

  const summary = notifications.length > 0
    ? notifications.join("\n")
    : "No notifications found";
  console.log(`Found ${notifications.length} notifications`);
  return { success: true, message: `Notifications:\n${summary}`, data: summary };
}
|
||||||
|
|
||||||
|
/**
 * Copies a file from the device into the local `./pulled_files` directory.
 *
 * The local file name is the last meaningful segment of the device path.
 * Empty, `.` and `..` segments are ignored so that a trailing slash or a
 * crafted path cannot produce an empty name or a target outside
 * `./pulled_files`; when nothing usable remains the name falls back to "file".
 *
 * @param action - decision whose `path` field is the device-side path
 * @returns a failure result when `path` is missing; otherwise a success result
 *          with the adb output in `data`
 */
function executePullFile(action: ActionDecision): ActionResult {
  const devicePath = action.path ?? "";
  if (!devicePath) return { success: false, message: "No device path provided" };

  // `recursive: true` makes this a no-op when the directory already exists.
  const { mkdirSync } = require("node:fs");
  mkdirSync("./pulled_files", { recursive: true });

  const usableSegments = devicePath
    .split("/")
    .filter((segment) => segment !== "" && segment !== "." && segment !== "..");
  const filename = usableSegments.pop() ?? "file";
  const localPath = `./pulled_files/${filename}`;

  console.log(`Pulling file: ${devicePath} → ${localPath}`);
  const result = runAdbCommand(["pull", devicePath, localPath]);
  return { success: true, message: `Pulled ${devicePath} → ${localPath}`, data: result };
}
|
||||||
|
|
||||||
|
/**
 * Uploads a local file to the device with `adb push`.
 *
 * @param action - decision carrying `source` (local path) and `dest` (device path)
 * @returns a failure result when either path is missing; otherwise a success
 *          result with the adb output in `data`
 */
function executePushFile(action: ActionDecision): ActionResult {
  const localSource = action.source ?? "";
  const deviceDest = action.dest ?? "";

  // Both ends of the transfer are mandatory.
  if (!(localSource && deviceDest)) {
    return { success: false, message: "Missing source or dest path" };
  }

  console.log(`Pushing file: ${localSource} → ${deviceDest}`);
  const adbOutput = runAdbCommand(["push", localSource, deviceDest]);

  return {
    success: true,
    message: `Pushed ${localSource} → ${deviceDest}`,
    data: adbOutput,
  };
}
|
||||||
|
|
||||||
|
/**
 * Sends an arbitrary Android keycode via `input keyevent`. Escape hatch for
 * keys that no dedicated action covers.
 *
 * The decision comes from parsed LLM JSON, so `code` is checked at runtime:
 * only a non-negative integer (or a string made solely of digits) is accepted.
 * Anything else is rejected instead of being interpolated into the shell
 * command.
 *
 * @param action - decision whose `code` field is the numeric keycode
 * @returns a failure result when the keycode is missing or not a non-negative
 *          integer; success otherwise
 */
function executeKeyevent(action: ActionDecision): ActionResult {
  const code: unknown = action.code;
  if (code == null) return { success: false, message: "No keycode provided" };

  let keycode = Number.NaN;
  if (typeof code === "number") {
    keycode = code;
  } else if (typeof code === "string" && /^\d+$/.test(code.trim())) {
    // Models sometimes quote numbers; accept "187" as 187.
    keycode = Number(code.trim());
  }

  if (!Number.isInteger(keycode) || keycode < 0) {
    return { success: false, message: `Invalid keycode: ${String(code)}` };
  }

  console.log(`Sending keyevent: ${keycode}`);
  runAdbCommand(["shell", "input", "keyevent", String(keycode)]);
  return { success: true, message: `Sent keyevent ${keycode}` };
}
|
||||||
|
|
||||||
|
/**
 * Friendly setting names accepted by the open_settings action, mapped to the
 * Android intent action that opens the corresponding settings screen.
 */
const SETTINGS_MAP: Record<string, string> = {
  wifi: "android.settings.WIFI_SETTINGS",
  bluetooth: "android.settings.BLUETOOTH_SETTINGS",
  display: "android.settings.DISPLAY_SETTINGS",
  sound: "android.settings.SOUND_SETTINGS",
  battery: "android.settings.BATTERY_SAVER_SETTINGS",
  location: "android.settings.LOCATION_SOURCE_SETTINGS",
  apps: "android.settings.APPLICATION_SETTINGS",
  date: "android.settings.DATE_SETTINGS",
  accessibility: "android.settings.ACCESSIBILITY_SETTINGS",
  developer: "android.settings.APPLICATION_DEVELOPMENT_SETTINGS",
};

/**
 * Opens a specific Android settings screen.
 *
 * The requested name is matched case-insensitively, and only against the map's
 * own keys: a plain `SETTINGS_MAP[name]` lookup would also resolve inherited
 * properties such as "constructor" and hand a non-string to adb.
 *
 * @param action - decision whose `setting` field is one of the SETTINGS_MAP keys
 * @returns a failure result listing the valid names when the setting is
 *          unknown; otherwise a success result with the adb output in `data`
 */
function executeOpenSettings(action: ActionDecision): ActionResult {
  const requested = action.setting ?? "";
  const setting = requested.trim().toLowerCase();

  const intentAction = Object.prototype.hasOwnProperty.call(SETTINGS_MAP, setting)
    ? SETTINGS_MAP[setting]
    : undefined;

  if (!intentAction) {
    const valid = Object.keys(SETTINGS_MAP).join(", ");
    return { success: false, message: `Unknown setting "${requested}". Valid: ${valid}` };
  }

  console.log(`Opening settings: ${setting}`);
  const result = runAdbCommand(["shell", "am", "start", "-a", intentAction]);
  return { success: true, message: `Opened ${setting} settings`, data: result };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Runs an arbitrary ADB shell command. Use sparingly for edge cases.
|
* Runs an arbitrary ADB shell command. Use sparingly for edge cases.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ import { executeSkill } from "./skills.js";
|
|||||||
import {
|
import {
|
||||||
getLlmProvider,
|
getLlmProvider,
|
||||||
trimMessages,
|
trimMessages,
|
||||||
|
parseJsonResponse,
|
||||||
SYSTEM_PROMPT,
|
SYSTEM_PROMPT,
|
||||||
type LLMProvider,
|
type LLMProvider,
|
||||||
type ChatMessage,
|
type ChatMessage,
|
||||||
@@ -170,54 +171,11 @@ async function getDecisionStreaming(
|
|||||||
return parseJsonResponse(accumulated);
|
return parseJsonResponse(accumulated);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Makes raw LLM output safe to hand to JSON.parse().
 *
 * JSON forbids unescaped control characters (U+0000-U+001F) inside string
 * values, yet models regularly emit literal newlines, carriage returns and
 * tabs there. Each such character is replaced with a single space. Outside of
 * strings these characters are at most insignificant whitespace, so the
 * replacement does not change what a valid document parses to.
 *
 * @param raw - text as received from the model
 * @returns `raw` with every control character replaced by one space
 */
function sanitizeJsonText(raw: string): string {
  return raw.replace(/[\u0000-\u001F]/g, " ");
}
|
|
||||||
|
|
||||||
/**
 * Parses a streamed LLM reply into an ActionDecision.
 *
 * Three candidates are tried in order and the first that parses to a JSON
 * object wins:
 *   1. the text as-is,
 *   2. the text after sanitizeJsonText(),
 *   3. the outermost `{ ... }` span found in the text (markdown fences or
 *      surrounding chatter), sanitized.
 *
 * A payload that is valid JSON but not an object (number, string, array, null)
 * is not a decision and is skipped rather than returned; assigning
 * `coordinates` on a primitive would throw in strict mode.
 *
 * @param text - raw model output
 * @returns the parsed decision with sanitized coordinates, or a "wait"
 *          decision when nothing usable could be extracted
 */
function parseJsonResponse(text: string): ActionDecision {
  // Built lazily so later fallbacks cost nothing when an earlier one succeeds.
  const candidates: Array<() => string | null> = [
    () => text,
    () => sanitizeJsonText(text),
    () => {
      const embedded = text.match(/\{[\s\S]*\}/);
      return embedded ? sanitizeJsonText(embedded[0]) : null;
    },
  ];

  let decision: ActionDecision | null = null;
  for (const produce of candidates) {
    const candidate = produce();
    if (candidate === null) continue;
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        decision = parsed as ActionDecision;
        break;
      }
    } catch {
      // Not parseable in this form; fall through to the next candidate.
    }
  }

  if (!decision) {
    console.log(`Warning: Could not parse streamed response: ${text.slice(0, 200)}`);
    return { action: "wait", reason: "Failed to parse response, waiting" };
  }

  decision.coordinates = sanitizeCoordinates(decision.coordinates);
  return decision;
}
|
|
||||||
|
|
||||||
// ===========================================
|
// ===========================================
|
||||||
// Main Agent Loop
|
// Main Agent Loop
|
||||||
// ===========================================
|
// ===========================================
|
||||||
|
|
||||||
async function runAgent(goal: string, maxSteps?: number): Promise<void> {
|
export async function runAgent(goal: string, maxSteps?: number): Promise<{ success: boolean; stepsUsed: number }> {
|
||||||
const steps = maxSteps ?? Config.MAX_STEPS;
|
const steps = maxSteps ?? Config.MAX_STEPS;
|
||||||
|
|
||||||
// Phase 1A: Auto-detect screen resolution
|
// Phase 1A: Auto-detect screen resolution
|
||||||
@@ -485,7 +443,7 @@ async function runAgent(goal: string, maxSteps?: number): Promise<void> {
|
|||||||
if (decision.action === "done") {
|
if (decision.action === "done") {
|
||||||
console.log("\nTask completed successfully.");
|
console.log("\nTask completed successfully.");
|
||||||
logger.finalize(true);
|
logger.finalize(true);
|
||||||
return;
|
return { success: true, stepsUsed: step + 1 };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for UI to update
|
// Wait for UI to update
|
||||||
@@ -494,6 +452,7 @@ async function runAgent(goal: string, maxSteps?: number): Promise<void> {
|
|||||||
|
|
||||||
console.log("\nMax steps reached. Task may be incomplete.");
|
console.log("\nMax steps reached. Task may be incomplete.");
|
||||||
logger.finalize(false);
|
logger.finalize(false);
|
||||||
|
return { success: false, stepsUsed: steps };
|
||||||
}
|
}
|
||||||
|
|
||||||
// ===========================================
|
// ===========================================
|
||||||
@@ -508,7 +467,33 @@ async function main(): Promise<void> {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read user input from stdin
|
// Check for --workflow flag
|
||||||
|
const workflowIdx = process.argv.findIndex((a) => a === "--workflow" || a.startsWith("--workflow="));
|
||||||
|
if (workflowIdx !== -1) {
|
||||||
|
const arg = process.argv[workflowIdx];
|
||||||
|
const workflowFile = arg.includes("=")
|
||||||
|
? arg.split("=")[1]
|
||||||
|
: process.argv[workflowIdx + 1];
|
||||||
|
|
||||||
|
if (!workflowFile) {
|
||||||
|
console.log("Error: --workflow requires a JSON file path.");
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const { runWorkflow } = await import("./workflow.js");
|
||||||
|
const workflow = JSON.parse(await Bun.file(workflowFile).text());
|
||||||
|
const result = await runWorkflow(workflow);
|
||||||
|
|
||||||
|
console.log(`\n=== Workflow "${result.name}" ===`);
|
||||||
|
for (const step of result.steps) {
|
||||||
|
const status = step.success ? "OK" : "FAILED";
|
||||||
|
console.log(` [${status}] ${step.goal} (${step.stepsUsed} steps)${step.error ? ` — ${step.error}` : ""}`);
|
||||||
|
}
|
||||||
|
console.log(`\nResult: ${result.success ? "All steps completed" : "Some steps failed"}`);
|
||||||
|
process.exit(result.success ? 0 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Interactive mode: read goal from stdin
|
||||||
process.stdout.write("Enter your goal: ");
|
process.stdout.write("Enter your goal: ");
|
||||||
const goal = await new Promise<string>((resolve) => {
|
const goal = await new Promise<string>((resolve) => {
|
||||||
const reader = Bun.stdin.stream().getReader();
|
const reader = Bun.stdin.stream().getReader();
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ import {
|
|||||||
import { sanitizeCoordinates, type ActionDecision } from "./actions.js";
|
import { sanitizeCoordinates, type ActionDecision } from "./actions.js";
|
||||||
|
|
||||||
// ===========================================
|
// ===========================================
|
||||||
// System Prompt — all 15 actions + planning
|
// System Prompt — all 22 actions + planning
|
||||||
// ===========================================
|
// ===========================================
|
||||||
|
|
||||||
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
|
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
|
||||||
@@ -58,7 +58,7 @@ Example:
|
|||||||
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
|
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
|
||||||
|
|
||||||
═══════════════════════════════════════════
|
═══════════════════════════════════════════
|
||||||
AVAILABLE ACTIONS (15 total)
|
AVAILABLE ACTIONS (22 total)
|
||||||
═══════════════════════════════════════════
|
═══════════════════════════════════════════
|
||||||
|
|
||||||
Navigation (coordinates MUST be a JSON array of TWO separate integers [x, y] — never concatenate them):
|
Navigation (coordinates MUST be a JSON array of TWO separate integers [x, y] — never concatenate them):
|
||||||
@@ -77,12 +77,21 @@ App Control:
|
|||||||
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
|
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
|
||||||
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
|
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
|
||||||
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
|
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
|
||||||
|
{"action": "open_url", "url": "https://example.com", "reason": "Open URL in browser"}
|
||||||
|
{"action": "switch_app", "package": "com.whatsapp", "reason": "Switch to WhatsApp"}
|
||||||
|
{"action": "open_settings", "setting": "wifi|bluetooth|display|sound|battery|location|apps|date|accessibility|developer", "reason": "Open settings screen"}
|
||||||
|
|
||||||
Data:
|
Data:
|
||||||
{"action": "clipboard_get", "reason": "Read clipboard contents"}
|
{"action": "clipboard_get", "reason": "Read clipboard contents"}
|
||||||
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
|
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
|
||||||
{"action": "paste", "coordinates": [540, 804], "reason": "Paste clipboard into focused field"}
|
{"action": "paste", "coordinates": [540, 804], "reason": "Paste clipboard into focused field"}
|
||||||
|
|
||||||
|
Device & Files:
|
||||||
|
{"action": "notifications", "reason": "Read notification bar content"}
|
||||||
|
{"action": "pull_file", "path": "/sdcard/Download/file.pdf", "reason": "Pull file from device"}
|
||||||
|
{"action": "push_file", "source": "./file.pdf", "dest": "/sdcard/Download/file.pdf", "reason": "Push file to device"}
|
||||||
|
{"action": "keyevent", "code": 187, "reason": "Send keycode (187=recent apps, 26=power, etc.)"}
|
||||||
|
|
||||||
System:
|
System:
|
||||||
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
|
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
|
||||||
{"action": "wait", "reason": "Wait for screen to load"}
|
{"action": "wait", "reason": "Wait for screen to load"}
|
||||||
@@ -127,7 +136,7 @@ CRITICAL RULES
|
|||||||
7. READ PAGES: Use "read_screen" to collect all text from a page (search results, articles, feeds). It scrolls automatically and copies everything to clipboard.
|
7. READ PAGES: Use "read_screen" to collect all text from a page (search results, articles, feeds). It scrolls automatically and copies everything to clipboard.
|
||||||
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
|
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
|
||||||
9. SCROLLING: If the item you need isn't visible, use "scroll" with direction "down" to see more below, or "up" for above.
|
9. SCROLLING: If the item you need isn't visible, use "scroll" with direction "down" to see more below, or "up" for above.
|
||||||
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
|
10. MULTI-APP: Use "switch_app" with the package name to switch directly between apps. Or use "home" then "launch". Use "back" to return within the same app.
|
||||||
11. PASSWORDS: Never log or output the text of password fields.
|
11. PASSWORDS: Never log or output the text of password fields.
|
||||||
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
|
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
|
||||||
13. SUBMIT IN CHAT APPS: Use "submit_message" action instead of "enter" in chat apps. It finds and taps the Send button, waits for a response, and reports new content. Only use "enter" in search bars or web forms.
|
13. SUBMIT IN CHAT APPS: Use "submit_message" action instead of "enter" in chat apps. It finds and taps the Send button, waits for a response, and reports new content. Only use "enter" in search bars or web forms.
|
||||||
@@ -331,7 +340,7 @@ const actionDecisionSchema = z.object({
|
|||||||
think: z.string().optional().describe("Your reasoning about the current screen state and what to do next"),
|
think: z.string().optional().describe("Your reasoning about the current screen state and what to do next"),
|
||||||
plan: z.array(z.string()).optional().describe("3-5 high-level steps to achieve the goal"),
|
plan: z.array(z.string()).optional().describe("3-5 high-level steps to achieve the goal"),
|
||||||
planProgress: z.string().optional().describe("Which plan step you are currently on"),
|
planProgress: z.string().optional().describe("Which plan step you are currently on"),
|
||||||
action: z.string().describe("The action to take: tap, type, scroll, enter, back, home, wait, done, longpress, launch, clear, clipboard_get, clipboard_set, paste, shell, read_screen, submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email"),
|
action: z.string().describe("The action to take: tap, type, scroll, enter, back, home, wait, done, longpress, launch, clear, clipboard_get, clipboard_set, paste, shell, open_url, switch_app, notifications, pull_file, push_file, keyevent, open_settings, read_screen, submit_message, copy_visible_text, wait_for_content, find_and_tap, compose_email"),
|
||||||
coordinates: z.tuple([z.number(), z.number()]).optional().describe("Target field as [x, y] — used by tap, longpress, type, and paste"),
|
coordinates: z.tuple([z.number(), z.number()]).optional().describe("Target field as [x, y] — used by tap, longpress, type, and paste"),
|
||||||
text: z.string().optional().describe("Text to type, clipboard text, or email body for compose_email"),
|
text: z.string().optional().describe("Text to type, clipboard text, or email body for compose_email"),
|
||||||
direction: z.string().optional().describe("Scroll direction: up, down, left, right"),
|
direction: z.string().optional().describe("Scroll direction: up, down, left, right"),
|
||||||
@@ -343,6 +352,12 @@ const actionDecisionSchema = z.object({
|
|||||||
command: z.string().optional().describe("Shell command to run"),
|
command: z.string().optional().describe("Shell command to run"),
|
||||||
filename: z.string().optional().describe("Screenshot filename"),
|
filename: z.string().optional().describe("Screenshot filename"),
|
||||||
query: z.string().optional().describe("Email address for compose_email (REQUIRED), search term for find_and_tap (REQUIRED), or filter for copy_visible_text"),
|
query: z.string().optional().describe("Email address for compose_email (REQUIRED), search term for find_and_tap (REQUIRED), or filter for copy_visible_text"),
|
||||||
|
url: z.string().optional().describe("URL to open for open_url action"),
|
||||||
|
path: z.string().optional().describe("Device file path for pull_file action"),
|
||||||
|
source: z.string().optional().describe("Local file path for push_file action"),
|
||||||
|
dest: z.string().optional().describe("Device destination path for push_file action"),
|
||||||
|
code: z.number().optional().describe("Android keycode number for keyevent action"),
|
||||||
|
setting: z.string().optional().describe("Setting name for open_settings: wifi, bluetooth, display, sound, battery, location, apps, date, accessibility, developer"),
|
||||||
});
|
});
|
||||||
|
|
||||||
class OpenRouterProvider implements LLMProvider {
|
class OpenRouterProvider implements LLMProvider {
|
||||||
@@ -589,11 +604,11 @@ class BedrockProvider implements LLMProvider {
|
|||||||
* Sanitizes raw LLM text so it can be parsed as JSON.
|
* Sanitizes raw LLM text so it can be parsed as JSON.
|
||||||
* LLMs often put literal newlines inside JSON string values which breaks JSON.parse().
|
* LLMs often put literal newlines inside JSON string values which breaks JSON.parse().
|
||||||
*/
|
*/
|
||||||
export function sanitizeJsonText(raw: string): string {
  // JSON forbids every unescaped control character (U+0000-U+001F) inside a
  // string value, not only newlines: a literal tab breaks JSON.parse() just
  // the same. Each one becomes a single space; outside of strings these
  // characters are at most insignificant whitespace, so valid documents still
  // parse to the same value.
  return raw.replace(/[\u0000-\u001F]/g, " ");
}
|
||||||
|
|
||||||
function parseJsonResponse(text: string): ActionDecision {
|
export function parseJsonResponse(text: string): ActionDecision {
|
||||||
let decision: ActionDecision | null = null;
|
let decision: ActionDecision | null = null;
|
||||||
try {
|
try {
|
||||||
decision = JSON.parse(text);
|
decision = JSON.parse(text);
|
||||||
|
|||||||
134
src/workflow.ts
Normal file
134
src/workflow.ts
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
/**
|
||||||
|
* Workflow orchestration engine for DroidClaw.
|
||||||
|
*
|
||||||
|
* Executes a sequence of sub-goals, each optionally scoped to a specific app.
|
||||||
|
* This is DroidClaw's equivalent of `analyze_and_act(sub_goal, app)`.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* bun run src/kernel.ts --workflow examples/logistics-workflow.json
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { runAgent } from "./kernel.js";
|
||||||
|
import { runAdbCommand } from "./actions.js";
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Types
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/** One sub-goal of a workflow, optionally bound to a specific app. */
export interface WorkflowStep {
  /** Natural-language objective handed to the agent for this step. */
  goal: string;
  /** Package name to launch before the step runs; no app switch when omitted. */
  app?: string;
  /** Upper bound on agent iterations for this step; a default is used when omitted. */
  maxSteps?: number;
  /** Field label → value pairs appended to the goal as a "FORM DATA TO FILL" list. */
  formData?: Record<string, string>;
}
|
||||||
|
|
||||||
|
/** A named, ordered list of sub-goals, as loaded from a workflow JSON file. */
export interface Workflow {
  /** Display name, echoed in log output and in the final result. */
  name: string;
  /** Sub-goals, executed one after another in array order. */
  steps: WorkflowStep[];
}
|
||||||
|
|
||||||
|
/** Outcome of a single workflow step. */
export interface StepResult {
  /** The step's original goal text (without any appended form data). */
  goal: string;
  /** Package the step was scoped to, if any. */
  app?: string;
  /** Whether the agent reported the sub-goal as achieved. */
  success: boolean;
  /** Agent iterations consumed; 0 when the step threw before reporting. */
  stepsUsed: number;
  /** Error message, present only when the step threw. */
  error?: string;
}
|
||||||
|
|
||||||
|
/** Aggregate outcome of a whole workflow run. */
export interface WorkflowResult {
  /** Name of the workflow that was executed. */
  name: string;
  /** One entry per executed step, in execution order. */
  steps: StepResult[];
  /** True only when every step succeeded. */
  success: boolean;
}
|
||||||
|
|
||||||
|
// ===========================================
|
||||||
|
// Workflow Engine
|
||||||
|
// ===========================================
|
||||||
|
|
||||||
|
/** Agent iteration cap applied to a workflow step that does not set `maxSteps`. */
const DEFAULT_STEP_LIMIT = 15;

/** Pause, in milliseconds, between launching a step's app and starting the agent. */
const APP_LAUNCH_DELAY_MS = 2_000;
|
||||||
|
|
||||||
|
/**
 * Produces the goal text the agent actually receives for a workflow step.
 *
 * A step without form data yields its goal unchanged. Otherwise the form
 * fields are rendered as a "- label: value" bullet list and appended together
 * with an instruction to fill them in.
 */
function buildGoal(step: WorkflowStep): string {
  const fields = step.formData ? Object.entries(step.formData) : [];
  if (fields.length === 0) {
    return step.goal;
  }

  const bulletList = fields
    .map(([label, value]) => `- ${label}: ${value}`)
    .join("\n");

  return `${step.goal}\n\nFORM DATA TO FILL:\n${bulletList}\n\nFind each field on screen and enter the corresponding value.`;
}
|
||||||
|
|
||||||
|
/**
 * Brings the given app to the foreground by asking `monkey` to fire a single
 * LAUNCHER-category event at its package.
 */
function switchToApp(packageName: string): void {
  console.log(`Switching to app: ${packageName}`);

  const launchCommand = [
    "shell",
    "monkey",
    "-p",
    packageName,
    "-c",
    "android.intent.category.LAUNCHER",
    "1",
  ];
  runAdbCommand(launchCommand);
}
|
||||||
|
|
||||||
|
/**
 * Executes a full workflow: a sequence of sub-goals with optional app switching.
 *
 * Steps run strictly in order. For each step the target app (if any) is
 * launched, the effective goal is built (goal text plus form data) and the
 * agent is run with the step's iteration budget. A step that throws, whether
 * during the app switch or inside the agent, is recorded as failed with its
 * error message, and the workflow continues with the next step.
 *
 * @param workflow - parsed workflow definition
 * @returns per-step results plus an overall flag that is true only when every
 *          step succeeded
 * @throws TypeError when `workflow.steps` is not an array
 */
export async function runWorkflow(workflow: Workflow): Promise<WorkflowResult> {
  // The definition comes straight from JSON.parse(); fail with a readable
  // message instead of "cannot read properties of undefined".
  if (!Array.isArray(workflow?.steps)) {
    throw new TypeError('Invalid workflow: "steps" must be an array of step objects');
  }

  const total = workflow.steps.length;

  console.log(`\n========================================`);
  console.log(`Workflow: ${workflow.name}`);
  console.log(`Steps: ${total}`);
  console.log(`========================================`);

  const results: StepResult[] = [];

  for (const [index, step] of workflow.steps.entries()) {
    const stepNumber = index + 1;
    console.log(`\n--- Step ${stepNumber}/${total}: ${step.goal} ---`);

    let result: StepResult;
    try {
      // The app switch lives inside the try so an adb failure marks only this
      // step as failed instead of aborting the whole workflow.
      if (step.app) {
        switchToApp(step.app);
        await Bun.sleep(APP_LAUNCH_DELAY_MS);
      }

      const agentResult = await runAgent(
        buildGoal(step),
        step.maxSteps ?? DEFAULT_STEP_LIMIT,
      );
      result = {
        goal: step.goal,
        app: step.app,
        success: agentResult.success,
        stepsUsed: agentResult.stepsUsed,
      };
    } catch (err: unknown) {
      result = {
        goal: step.goal,
        app: step.app,
        success: false,
        stepsUsed: 0,
        // Anything can be thrown; only Error instances carry `.message`.
        error: err instanceof Error ? err.message : String(err),
      };
    }

    results.push(result);

    const status = result.success ? "completed" : "failed";
    console.log(`\nStep ${stepNumber} ${status} (${result.stepsUsed} steps used)`);
  }

  const allSuccess = results.every((r) => r.success);
  return { name: workflow.name, steps: results, success: allSuccess };
}
|
||||||
Reference in New Issue
Block a user