Add auto-scroll to find_and_tap skill for off-screen elements

find_and_tap now scrolls down and rescans (up to 10 times) when the
target element isn't visible on the current screen. Stops as soon as
the element is found — no wasted scrolls. This removes the need for
LLMs to manually scroll-and-check in workflow prompts.

Also simplifies the Gemini-to-WhatsApp workflow prompts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sanju Sivalingam
2026-02-16 21:56:34 +05:30
parent 2312f8bece
commit 9e9f398ed6
2 changed files with 60 additions and 33 deletions

View File

@@ -1,23 +1,23 @@
{
"name": "Ask Gemini about DroidClaw and share to WhatsApp",
"name": "Ask Gemini about DunSocial and share to WhatsApp",
"steps": [
{
"app": "com.google.android.apps.bard",
"goal": "Gemini app is now open. Find the text input field and type exactly: 'tell me more about droidclaw.ai' — then look for the send icon button which is located next to the 'fast' button at the bottom of the input area. Tap that send icon to submit the query. Do NOT press enter — you MUST tap the send icon. Wait for the full response to load completely. Do NOT mark done until you can see the AI has generated a response with actual text content on screen.",
"goal": "Type 'tell me more about duntasks.com' in the text input field and tap send. Wait for the full AI response to load before marking done.",
"maxSteps": 10
},
{
"goal": "Gemini has finished generating its response. The Copy button is NOT visible yet — it is located at the BOTTOM of the response, below all the text. You MUST scroll down repeatedly until you reach the very end of the response where action buttons appear. Keep scrolling down until you see a 'Copy' button or a copy icon (it may look like two overlapping squares). Once the Copy button is visible on screen, tap it. After tapping Copy, wait briefly — the text is now in the clipboard. Then mark done.",
"maxSteps": 15
"goal": "Use find_and_tap with query 'Copy' to find and tap the Copy button. It will automatically scroll down to find it. Mark done after it completes.",
"maxSteps": 5
},
{
"app": "com.whatsapp",
"goal": "WhatsApp is now open showing the chat list. Look for a contact or chat named 'vi' or 'Vi' in the visible chat list. IMPORTANT: When you find the contact, tap on the contact NAME TEXT on the right side of the row — do NOT tap on the profile picture/image on the left side. If 'vi' is NOT visible in the chat list, tap the search icon (magnifying glass) at the top of the screen, then type 'vi' in the search field, then tap on the matching contact name text (not the profile image) to open that chat. Do NOT mark done until you are inside the chat conversation with Vi.",
"maxSteps": 10
"goal": "Open the chat with contact 'Vi'. If Vi is visible in the chat list, tap it. If not, search for 'vi' and tap the result. Mark done once inside the chat.",
"maxSteps": 8
},
{
"goal": "You are now inside the WhatsApp chat with Vi. The clipboard already contains the Gemini response from the previous step. Do these actions in this EXACT order: Step 1: Tap the message input field at the bottom of the chat (where it says 'Type a message' or 'Message'). Step 2: Use the 'paste' action — this will paste the clipboard content into the input field. Do NOT try to long-press or manually paste — just use the paste action directly. Step 3: After pasting, you should see text in the input field. Now tap the green Send button (arrow icon) to send the message. Mark done after the message is sent.",
"maxSteps": 10
"goal": "Tap the message input field, paste clipboard content, then tap Send.",
"maxSteps": 8
}
]
}

View File

@@ -311,6 +311,27 @@ function waitForContent(elements: UIElement[]): ActionResult {
// Skill 4: find_and_tap
// ===========================================
/**
 * Picks the most suitable element whose text contains the query.
 *
 * An element is a candidate when it has non-empty text and its lowercased
 * text includes `queryLower`. Each candidate is ranked:
 *   - +10 when enabled
 *   - +5 when clickable or long-clickable
 *   - +20 when the lowercased text equals `queryLower`, otherwise +5
 * When several candidates share the top rank, the one that appears earliest
 * in `elements` is returned.
 *
 * @param elements - Elements to search.
 * @param queryLower - Search text; compared against lowercased element text,
 *   so it is expected to be lowercased already.
 * @returns The highest-ranked candidate, or null when nothing matches.
 */
function findMatch(elements: UIElement[], queryLower: string): UIElement | null {
  let winner: UIElement | null = null;
  let winnerRank = -1; // every real rank is at least 5

  for (const candidate of elements) {
    if (!candidate.text) continue;

    const textLower = candidate.text.toLowerCase();
    if (!textLower.includes(queryLower)) continue;

    const rank =
      (candidate.enabled ? 10 : 0) +
      (candidate.clickable || candidate.longClickable ? 5 : 0) +
      (textLower === queryLower ? 20 : 5);

    // Strictly greater keeps the earliest candidate when ranks are equal.
    if (rank > winnerRank) {
      winner = candidate;
      winnerRank = rank;
    }
  }

  return winner;
}
function findAndTap(
decision: ActionDecision,
elements: UIElement[]
@@ -322,42 +343,48 @@ function findAndTap(
const queryLower = query.toLowerCase();
// 1. Search elements for text matching query
const matches = elements.filter(
(el) => el.text && el.text.toLowerCase().includes(queryLower)
);
// 1. Check current screen first
let best = findMatch(elements, queryLower);
if (matches.length === 0) {
// Return available element texts to help the LLM
// 2. If not found, scroll down and re-check (up to 10 scrolls)
if (!best) {
const swipeCoords = getSwipeCoords();
const upCoords = swipeCoords["up"]; // swipe up = scroll down
const maxScrolls = 10;
for (let i = 0; i < maxScrolls; i++) {
console.log(`find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})`);
runAdbCommand([
"shell", "input", "swipe",
String(upCoords[0]), String(upCoords[1]),
String(upCoords[2]), String(upCoords[3]),
SWIPE_DURATION_MS,
]);
Bun.sleepSync(1500);
const freshElements = rescanScreen();
best = findMatch(freshElements, queryLower);
if (best) {
console.log(`find_and_tap: Found "${query}" after ${i + 1} scroll(s)`);
break;
}
}
}
if (!best) {
const available = elements
.filter((el) => el.text)
.map((el) => el.text)
.slice(0, 15);
return {
success: false,
message: `No element matching "${query}" found. Available: ${available.join(", ")}`,
message: `No element matching "${query}" found after scrolling. Available: ${available.join(", ")}`,
};
}
// 2. Score matches
const scored = matches.map((el) => {
let score = 0;
if (el.enabled) score += 10;
if (el.clickable || el.longClickable) score += 5;
if (el.text.toLowerCase() === queryLower) score += 20; // exact match
else score += 5; // partial match
return { el, score };
});
// 3. Pick highest-scoring match
scored.sort((a, b) => b.score - a.score);
const best = scored[0].el;
// 3. Tap it
const [x, y] = best.center;
// 4. Tap it
console.log(
`find_and_tap: Tapping "${best.text}" at (${x}, ${y}) [score: ${scored[0].score}]`
);
console.log(`find_and_tap: Tapping "${best.text}" at (${x}, ${y})`);
runAdbCommand(["shell", "input", "tap", String(x), String(y)]);
return {