time to rethink architecture of this...

This commit is contained in:
Sanju Sivalingam
2026-02-07 18:04:46 +05:30
parent 76b7bce405
commit 5c471ec19a
43 changed files with 8991 additions and 57 deletions

476
src/skills.ts Normal file
View File

@@ -0,0 +1,476 @@
/**
* Skills module for DroidClaw.
* Multi-step smart actions that reduce LLM decision points and eliminate
* entire categories of errors (coordinate guessing, wrong submit buttons, etc.)
*
* Skills:
* read_screen — Scroll through the screen, collect all visible text, and copy it to the clipboard
* submit_message — Find and tap the Send/Submit button in chat apps
* copy_visible_text — Read text from screen elements and set clipboard programmatically
* wait_for_content — Wait for new content to appear (AI responses, page loads)
* find_and_tap — Find an element by text label and tap it
* compose_email — Open the email composer via a mailto: intent (To pre-filled) and paste the body
*/
import { existsSync, readFileSync } from "fs";
import { Config } from "./config.js";
import { runAdbCommand, getSwipeCoords, type ActionDecision, type ActionResult } from "./actions.js";
import { getInteractiveElements, type UIElement } from "./sanitizer.js";
import { SWIPE_DURATION_MS } from "./constants.js";
/**
 * Dispatches a skill request to the function that implements it.
 *
 * The skill name is read from `decision.skill`, falling back to
 * `decision.action` when `skill` is null or undefined.
 *
 * @param decision - Action decision naming the skill; forwarded to the skills
 *   that read additional fields from it.
 * @param elements - UI elements of the current screen, forwarded to the skill.
 * @returns The selected skill's result, or a failed result carrying an
 *   "Unknown skill" message when the name is not recognised.
 */
export function executeSkill(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  const requested = decision.skill ?? decision.action;
  console.log(`Executing multi-step action: ${requested}`);

  if (requested === "read_screen") return readScreen(elements);
  if (requested === "submit_message") return submitMessage(elements);
  if (requested === "copy_visible_text") return copyVisibleText(decision, elements);
  if (requested === "wait_for_content") return waitForContent(elements);
  if (requested === "find_and_tap") return findAndTap(decision, elements);
  if (requested === "compose_email") return composeEmail(decision, elements);

  return { success: false, message: `Unknown skill: ${requested}` };
}
// ===========================================
// Helpers: clipboard write and screen re-scan
// ===========================================
/**
 * Writes `text` to the device clipboard through `cmd clipboard set-text`.
 *
 * ADB shell joins its arguments into one command string, so characters such
 * as parentheses or quotes would otherwise be interpreted by the shell. The
 * payload is therefore wrapped in single quotes, and every embedded single
 * quote is rewritten as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param text - The text to place on the clipboard.
 */
function safeClipboardSet(text: string): void {
  const quotedPayload = text.split("'").join("'\\''");
  const command = `cmd clipboard set-text '${quotedPayload}'`;
  runAdbCommand(["shell", command]);
}
/**
 * Captures a fresh UI dump from the device and parses it into elements.
 *
 * Runs `uiautomator dump` on the device, pulls the resulting file to the
 * local dump path, and feeds its contents to `getInteractiveElements`.
 *
 * @returns The parsed elements, or an empty array when either ADB command
 *   throws or the pulled dump file does not exist locally.
 */
function rescanScreen(): UIElement[] {
  try {
    // Dump on the device first, then copy the dump to the local machine.
    runAdbCommand(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH]);
    runAdbCommand(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH]);
  } catch {
    console.log("Warning: ADB screen capture failed during skill re-scan.");
    return [];
  }

  if (existsSync(Config.LOCAL_DUMP_PATH)) {
    const dumpXml = readFileSync(Config.LOCAL_DUMP_PATH, "utf-8");
    return getInteractiveElements(dumpXml);
  }
  return [];
}
// ===========================================
// Skill 0: read_screen (scroll + collect all text)
// ===========================================
/**
 * Collects every distinct text on the screen, scrolling down to reveal more.
 *
 * Starts with the texts in `elements`, then performs up to five "up" swipes,
 * re-scanning after each one and stopping early as soon as a scroll yields no
 * previously unseen text. The collected texts are joined with newlines and,
 * when non-empty, written to the device clipboard.
 *
 * @param elements - UI elements of the screen before any scrolling.
 * @returns A successful result whose `data` is the combined text. The message
 *   reports the element count, scroll count and character count, and states
 *   whether the clipboard was actually written.
 */
function readScreen(elements: UIElement[]): ActionResult {
  const allTexts: string[] = [];
  const seenTexts = new Set<string>();

  /** Appends texts not seen before and returns how many were added. */
  function collectTexts(els: UIElement[]): number {
    let added = 0;
    for (const el of els) {
      if (el.text && !seenTexts.has(el.text)) {
        seenTexts.add(el.text);
        allTexts.push(el.text);
        added++;
      }
    }
    return added;
  }

  // 1. Collect from initial screen
  collectTexts(elements);

  // 2. Scroll down and collect until no new content
  const swipeCoords = getSwipeCoords();
  const upCoords = swipeCoords["up"]; // swipe up = scroll down = see more below
  const maxScrolls = 5;
  let scrollsDone = 0;
  for (let i = 0; i < maxScrolls; i++) {
    runAdbCommand([
      "shell", "input", "swipe",
      String(upCoords[0]), String(upCoords[1]),
      String(upCoords[2]), String(upCoords[3]),
      SWIPE_DURATION_MS,
    ]);
    // Pause before re-scanning so the dump is taken after the swipe.
    Bun.sleepSync(1500);
    scrollsDone++;
    const newElements = rescanScreen();
    const added = collectTexts(newElements);
    console.log(`read_screen: Scroll ${scrollsDone} — found ${added} new text elements`);
    if (added === 0) break;
  }

  const combinedText = allTexts.join("\n");

  // 3. Copy to clipboard for easy access — only when there is something to copy
  const copied = combinedText.length > 0;
  if (copied) {
    safeClipboardSet(combinedText);
  }

  // Only claim a clipboard copy when one actually happened; previously the
  // message said "copied to clipboard" even when no text had been found.
  const stats = `Read ${allTexts.length} text elements across ${scrollsDone} scrolls (${combinedText.length} chars)`;
  return {
    success: true,
    message: copied
      ? `${stats}, copied to clipboard`
      : `${stats}; nothing to copy, clipboard left unchanged`,
    data: combinedText,
  };
}
// ===========================================
// Skill 1: submit_message
// ===========================================
/** Matches element text or resource IDs that suggest a send/submit control. */
const SEND_BUTTON_PATTERN = /send|submit|post|arrow|paper.?plane/i;

/**
 * Finds the most likely Send/Submit button, taps it, and reports new content.
 *
 * Selection strategy:
 *  1. Enabled, tappable elements whose text or ID matches SEND_BUTTON_PATTERN
 *     (first match in input order wins).
 *  2. Otherwise, enabled clickable elements whose centre Y is at least 80% of
 *     the largest centre Y among them, preferring the rightmost one.
 *
 * After tapping, waits 6 seconds, re-scans the screen and looks for texts that
 * were not present in `elements`.
 *
 * @param elements - UI elements of the current screen.
 * @returns A failed result when no candidate exists; otherwise a successful
 *   result whose `data` holds up to three new texts when any appeared.
 */
function submitMessage(elements: UIElement[]): ActionResult {
  const looksLikeSend = (el: UIElement): boolean =>
    SEND_BUTTON_PATTERN.test(el.text) || SEND_BUTTON_PATTERN.test(el.id);

  // Pass 1: label/ID based detection.
  let candidates = elements.filter(
    (el) => el.enabled && (el.clickable || el.action === "tap") && looksLikeSend(el)
  );

  // Pass 2: positional guess — low on the screen, as far right as possible.
  if (candidates.length === 0) {
    const lowestFirst = elements.filter((el) => el.enabled && el.clickable);
    lowestFirst.sort((a, b) => b.center[1] - a.center[1]);
    if (lowestFirst.length > 0) {
      const cutoffY = lowestFirst[0].center[1] * 0.8;
      candidates = lowestFirst.filter((el) => el.center[1] >= cutoffY);
      candidates.sort((a, b) => b.center[0] - a.center[0]);
    }
  }

  if (candidates.length === 0) {
    return {
      success: false,
      message: "Could not find a Send/Submit button on screen",
    };
  }

  const target = candidates[0];
  const [x, y] = target.center;
  console.log(
    `submit_message: Tapping "${target.text}" at (${x}, ${y})`
  );
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);

  // Give the app time to react before looking at the screen again.
  console.log("submit_message: Waiting 6s for response...");
  Bun.sleepSync(6000);
  const rescanned = rescanScreen();

  // Diff the non-empty texts of the new screen against the original one.
  const knownTexts = new Set<string>();
  for (const el of elements) {
    if (el.text) knownTexts.add(el.text);
  }
  const freshTexts: string[] = [];
  for (const el of rescanned) {
    if (el.text && !knownTexts.has(el.text)) freshTexts.push(el.text);
  }

  if (freshTexts.length === 0) {
    return {
      success: true,
      message: `Tapped "${target.text}" at (${x}, ${y}). No new content yet — may still be loading.`,
    };
  }

  const summary = freshTexts.slice(0, 3).join("; ");
  return {
    success: true,
    message: `Tapped "${target.text}" and new content appeared: ${summary}`,
    data: summary,
  };
}
// ===========================================
// Skill 2: copy_visible_text
// ===========================================
/**
 * Copies on-screen text to the device clipboard.
 *
 * Prefers elements whose action is "read"; if none of those qualify, falls
 * back to every element that has text. When `decision.query` is non-empty,
 * only elements whose text contains it (case-insensitively) are kept in
 * either pass. The selected texts are ordered top-to-bottom by centre Y and
 * joined with newlines.
 *
 * @param decision - Action decision; `query` optionally narrows the text.
 * @param elements - UI elements of the current screen.
 * @returns A failed result when nothing matches; otherwise a successful
 *   result whose `data` is the text written to the clipboard.
 */
function copyVisibleText(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  const needle = decision.query ? decision.query.toLowerCase() : undefined;
  const passesQuery = (el: UIElement): boolean =>
    needle === undefined || el.text.toLowerCase().includes(needle);

  // First choice: read-only text. Fallback: anything carrying text.
  let selected = elements.filter(
    (el) => el.text && el.action === "read" && passesQuery(el)
  );
  if (selected.length === 0) {
    selected = elements.filter((el) => el.text && passesQuery(el));
  }

  if (selected.length === 0) {
    const message = decision.query
      ? `No text matching "${decision.query}" found on screen`
      : "No readable text found on screen";
    return { success: false, message };
  }

  // Reading order: top of the screen first.
  const topToBottom = [...selected].sort((a, b) => a.center[1] - b.center[1]);
  const lines = topToBottom.map((el) => el.text);
  const combinedText = lines.join("\n");

  console.log(
    `copy_visible_text: Copying ${topToBottom.length} text elements (${combinedText.length} chars)`
  );
  safeClipboardSet(combinedText);

  return {
    success: true,
    message: `Copied ${topToBottom.length} text elements to clipboard (${combinedText.length} chars)`,
    data: combinedText,
  };
}
// ===========================================
// Skill 3: wait_for_content
// ===========================================
/**
 * Polls the screen until a meaningful amount of new text shows up.
 *
 * Makes up to five attempts, sleeping 3 seconds before each re-scan (15
 * seconds at most). Text is "new" when it is non-empty and absent from
 * `elements`; the wait ends once the new texts total more than 20 characters.
 *
 * @param elements - UI elements of the screen before waiting.
 * @returns A successful result whose `data` joins up to five new texts, or a
 *   failed result when the threshold is never reached.
 */
function waitForContent(elements: UIElement[]): ActionResult {
  // Baseline: every non-empty text already on screen.
  const baseline = new Set<string>();
  for (const el of elements) {
    if (el.text) baseline.add(el.text);
  }

  let attempt = 0;
  while (attempt < 5) {
    attempt += 1;
    console.log(
      `wait_for_content: Waiting 3s... (attempt ${attempt}/5)`
    );
    Bun.sleepSync(3000);

    // Gather unseen texts and their total length in one pass.
    const fresh: string[] = [];
    let freshChars = 0;
    for (const el of rescanScreen()) {
      if (el.text && !baseline.has(el.text)) {
        fresh.push(el.text);
        freshChars += el.text.length;
      }
    }

    // Ignore small changes (20 chars or fewer) and keep polling.
    if (freshChars <= 20) continue;

    const summary = fresh.slice(0, 5).join("; ");
    console.log(
      `wait_for_content: Found ${fresh.length} new text elements (${freshChars} chars)`
    );
    return {
      success: true,
      message: `New content appeared after ${attempt * 3}s: ${summary}`,
      data: summary,
    };
  }

  return {
    success: false,
    message: "No new content appeared after 15s",
  };
}
// ===========================================
// Skill 4: find_and_tap
// ===========================================
/**
 * Taps the element whose text best matches `decision.query`.
 *
 * Every element whose text contains the query (case-insensitively) is scored:
 * +10 if enabled, +5 if clickable or long-clickable, +20 for an exact text
 * match or +5 for a partial one. The highest score wins; on a tie the element
 * that appears first in `elements` is chosen.
 *
 * @param decision - Action decision; `query` is the text to look for.
 * @param elements - UI elements of the current screen.
 * @returns A failed result when the query is missing or nothing matches (the
 *   message then lists up to 15 available texts); otherwise a successful
 *   result whose `data` is the tapped element's text.
 */
function findAndTap(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  const query = decision.query;
  if (!query) {
    return { success: false, message: "find_and_tap requires a query" };
  }
  const needle = query.toLowerCase();

  // Single pass: score each match and remember the first element with the top score.
  let best: UIElement | undefined;
  let bestScore = -1;
  for (const el of elements) {
    if (!el.text) continue;
    const label = el.text.toLowerCase();
    if (!label.includes(needle)) continue;

    const score =
      (el.enabled ? 10 : 0) +
      (el.clickable || el.longClickable ? 5 : 0) +
      (label === needle ? 20 : 5);
    if (score > bestScore) {
      best = el;
      bestScore = score;
    }
  }

  if (best === undefined) {
    // Surface what IS on screen so the caller can pick a better query.
    const available: string[] = [];
    for (const el of elements) {
      if (available.length === 15) break;
      if (el.text) available.push(el.text);
    }
    return {
      success: false,
      message: `No element matching "${query}" found. Available: ${available.join(", ")}`,
    };
  }

  const [x, y] = best.center;
  console.log(
    `find_and_tap: Tapping "${best.text}" at (${x}, ${y}) [score: ${bestScore}]`
  );
  runAdbCommand(["shell", "input", "tap", String(x), String(y)]);

  return {
    success: true,
    message: `Found and tapped "${best.text}" at (${x}, ${y})`,
    data: best.text,
  };
}
// ===========================================
// Skill 5: compose_email
// ===========================================
/**
 * Patterns to identify email compose fields by resource ID.
 *
 * NOTE(review): TO_FIELD_PATTERN is unanchored, so it matches any ID that
 * merely contains "to" (e.g. "button", "editor", "toolbar"). In the visible
 * part of this file only the BODY patterns are passed to findEmailField;
 * confirm whether the TO/SUBJECT patterns are used elsewhere before relying
 * on them.
 */
const TO_FIELD_PATTERN = /to|recipient/i;
const SUBJECT_FIELD_PATTERN = /subject/i;
const BODY_FIELD_PATTERN = /body|compose_area|compose_edit|message_content/i;
/**
 * Patterns to identify fields by hint text.
 * TO_HINT_PATTERN accepts a hint that is exactly "to", or any hint containing
 * "recipient" or "email address" (with at most one character between the two
 * words); all three patterns are case-insensitive.
 */
const TO_HINT_PATTERN = /^to$|recipient|email.?address/i;
const SUBJECT_HINT_PATTERN = /subject/i;
const BODY_HINT_PATTERN = /compose|body|message|write/i;
/**
 * Locates an editable field using three strategies, in priority order:
 *  1. resource ID matches `idPattern`,
 *  2. non-empty hint matches `hintPattern`,
 *  3. visible text matches `idPattern`.
 * Each strategy scans the whole list before the next one is tried, so an ID
 * match anywhere beats a hint match that appears earlier in the list.
 *
 * No positional fallback is performed here; callers must handle `undefined`.
 *
 * @param editables - Candidate editable elements.
 * @param idPattern - Pattern tested against each element's ID (and, as a last
 *   resort, its text).
 * @param hintPattern - Pattern tested against each element's hint.
 * @returns The first element found by the highest-priority strategy that
 *   matches, or `undefined` when none does.
 */
function findEmailField(
  editables: UIElement[],
  idPattern: RegExp,
  hintPattern: RegExp
): UIElement | undefined {
  const strategies: Array<(el: UIElement) => boolean> = [
    (el) => idPattern.test(el.id),
    (el) => !!el.hint && hintPattern.test(el.hint),
    (el) => idPattern.test(el.text),
  ];

  for (const matches of strategies) {
    const hit = editables.find(matches);
    if (hit) return hit;
  }
  return undefined;
}
/**
 * Pulls the first email-like token out of a string.
 *
 * @param text - Arbitrary text that may contain an email address.
 * @returns The first substring shaped like `local@domain.tld` (TLD of at
 *   least two word characters), or `null` when there is none.
 */
function extractEmail(text: string): string | null {
  const found = /[\w.+-]+@[\w.-]+\.\w{2,}/.exec(text);
  if (found === null) {
    return null;
  }
  return found[0];
}
/**
 * Opens the default email composer for an address and pastes the body.
 *
 * The recipient comes from `decision.query` (trimmed); when that is empty, the
 * first email address found in `decision.text` is used instead. The composer
 * is launched with a `mailto:` SENDTO intent, the screen is re-scanned, the
 * body field is tapped and a paste key event is sent. If `decision.text` is
 * set it is written to the clipboard first; otherwise whatever is already on
 * the clipboard gets pasted.
 *
 * @param decision - Action decision; `query` is the recipient, `text` the body.
 * @param elements - Current screen elements. Unused: the screen is re-scanned
 *   after the composer opens. Kept for a uniform skill signature.
 * @returns A failed result when no recipient can be resolved or no editable
 *   field appears after launch; otherwise a successful result.
 */
function composeEmail(
  decision: ActionDecision,
  elements: UIElement[]
): ActionResult {
  // Resolve email address: try query first, then extract from text.
  // Trimming makes a whitespace-only query count as "missing".
  let emailAddress = decision.query?.trim();
  const bodyContent = decision.text;
  if (!emailAddress && bodyContent) {
    const extracted = extractEmail(bodyContent);
    if (extracted) {
      emailAddress = extracted;
      console.log(`compose_email: Extracted email "${emailAddress}" from text field`);
    }
  }
  if (!emailAddress) {
    return {
      success: false,
      message: "compose_email requires query (email address). Example: {\"action\": \"compose_email\", \"query\": \"user@example.com\"}",
    };
  }

  // Always use mailto: intent — this is the most reliable path.
  // It opens the default email app with To pre-filled, regardless of current screen.
  console.log(`compose_email: Launching mailto:${emailAddress}`);

  // ADB shell joins its args into one command line that the device shell
  // re-parses, so an unquoted URI containing `&`, `;`, `?`, spaces or quotes
  // would be split or executed. Single-quote it and escape embedded quotes.
  const mailtoUri = `mailto:${emailAddress}`;
  const quotedUri = `'${mailtoUri.replace(/'/g, "'\\''")}'`;
  runAdbCommand([
    "shell", "am", "start", "-a", "android.intent.action.SENDTO",
    "-d", quotedUri,
  ]);
  // Pause before re-scanning so the dump is taken after the launch.
  Bun.sleepSync(2500);

  // Re-scan to find the compose screen
  const freshElements = rescanScreen();
  const editables = freshElements
    .filter((el) => el.editable && el.enabled)
    .sort((a, b) => a.center[1] - b.center[1]);
  if (editables.length === 0) {
    return { success: false, message: "Launched email compose but no editable fields appeared" };
  }

  // Find the body field — mailto: already handled the To field.
  // Positional fallback: the lowest editable field on screen.
  const bodyField =
    findEmailField(editables, BODY_FIELD_PATTERN, BODY_HINT_PATTERN) ??
    editables[editables.length - 1];

  const [bx, by] = bodyField.center;
  console.log(`compose_email: Tapping Body field at (${bx}, ${by})`);
  runAdbCommand(["shell", "input", "tap", String(bx), String(by)]);
  Bun.sleepSync(300);

  // Paste body content — use explicit text if provided, otherwise paste clipboard
  if (bodyContent) {
    safeClipboardSet(bodyContent);
    Bun.sleepSync(200);
  }
  runAdbCommand(["shell", "input", "keyevent", "279"]); // KEYCODE_PASTE

  return {
    success: true,
    message: `Email compose opened to ${emailAddress}, body pasted`,
  };
}