fix(agent): use device screen dimensions for scroll/swipe coordinates

Swipe coordinates were hardcoded for 1080x2400 screens, causing scrolls
to fail on devices with different resolutions. The agent now reads
screenWidth and screenHeight from DeviceInfo and computes swipe
coordinates proportionally, falling back to 1080x2400 when the
dimensions are unavailable.
This commit is contained in:
Sanju Sivalingam
2026-02-18 10:48:37 +05:30
parent 81d78684a5
commit a1ec1ac731
2 changed files with 50 additions and 18 deletions

View File

@@ -126,9 +126,14 @@ function diffScreenState(
* Maps an ActionDecision to a WebSocket command object for the device. * Maps an ActionDecision to a WebSocket command object for the device.
* The device companion app receives these and executes the corresponding * The device companion app receives these and executes the corresponding
* ADB/accessibility action. * ADB/accessibility action.
*
* @param screenWidth Device screen width in px (from DeviceInfo)
* @param screenHeight Device screen height in px (from DeviceInfo)
*/ */
function actionToCommand( function actionToCommand(
action: ActionDecision action: ActionDecision,
screenWidth = 1080,
screenHeight = 2400
): Record<string, unknown> { ): Record<string, unknown> {
switch (action.action) { switch (action.action) {
case "tap": case "tap":
@@ -152,15 +157,22 @@ function actionToCommand(
case "swipe": case "swipe":
case "scroll": { case "scroll": {
// Map scroll direction to swipe coordinates (default 1080px wide screen) // Compute swipe coordinates proportionally from device screen size
const cx = Math.round(screenWidth * 0.5);
const cy = Math.round(screenHeight * 0.5);
const topY = Math.round(screenHeight * 0.167);
const bottomY = Math.round(screenHeight * 0.667);
const leftX = Math.round(screenWidth * 0.167);
const rightX = Math.round(screenWidth * 0.833);
const dir = action.direction ?? "down"; const dir = action.direction ?? "down";
let x1 = 540, y1 = 1600, x2 = 540, y2 = 400; let x1 = cx, y1 = bottomY, x2 = cx, y2 = topY;
if (dir === "up") { if (dir === "up") {
y1 = 400; y2 = 1600; // swipe from top to bottom = scroll up y1 = topY; y2 = bottomY; // swipe from top to bottom = scroll up
} else if (dir === "left") { } else if (dir === "left") {
x1 = 900; y1 = 1200; x2 = 180; y2 = 1200; x1 = rightX; y1 = cy; x2 = leftX; y2 = cy;
} else if (dir === "right") { } else if (dir === "right") {
x1 = 180; y1 = 1200; x2 = 900; y2 = 1200; x1 = leftX; y1 = cy; x2 = rightX; y2 = cy;
} }
// dir === "down" uses defaults: swipe from bottom to top = scroll down // dir === "down" uses defaults: swipe from bottom to top = scroll down
return { type: "swipe", x1, y1, x2, y2 }; return { type: "swipe", x1, y1, x2, y2 };
@@ -314,6 +326,12 @@ export async function runAgentLoop(
deviceId: persistentDeviceId ?? deviceId, deviceId: persistentDeviceId ?? deviceId,
}); });
// Get device screen dimensions for accurate swipe coordinates
const connectedDevice = sessions.getDevice(deviceId);
const screenWidth = connectedDevice?.deviceInfo?.screenWidth ?? 1080;
const screenHeight = connectedDevice?.deviceInfo?.screenHeight ?? 2400;
console.log(`[Agent ${sessionId}] Device screen: ${screenWidth}x${screenHeight}`);
let stepsUsed = 0; let stepsUsed = 0;
let success = false; let success = false;
@@ -576,12 +594,14 @@ export async function runAgentLoop(
const skillResult = await executeSkill( const skillResult = await executeSkill(
deviceId, deviceId,
action as unknown as Record<string, unknown> & { action: string }, action as unknown as Record<string, unknown> & { action: string },
elements elements,
screenWidth,
screenHeight
); );
lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`; lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`;
} else { } else {
// Regular action: map to WebSocket command and send to device // Regular action: map to WebSocket command and send to device
const command = actionToCommand(action); const command = actionToCommand(action, screenWidth, screenHeight);
const result = (await sessions.sendCommand(deviceId, command)) as { const result = (await sessions.sendCommand(deviceId, command)) as {
success?: boolean; success?: boolean;
error?: string; error?: string;

View File

@@ -54,17 +54,19 @@ export function isSkillAction(action: string): boolean {
export async function executeSkill( export async function executeSkill(
deviceId: string, deviceId: string,
action: SkillAction, action: SkillAction,
currentElements: UIElement[] currentElements: UIElement[],
screenWidth = 1080,
screenHeight = 2400
): Promise<SkillResult> { ): Promise<SkillResult> {
switch (action.action) { switch (action.action) {
case "copy_visible_text": case "copy_visible_text":
return copyVisibleText(deviceId, action, currentElements); return copyVisibleText(deviceId, action, currentElements);
case "find_and_tap": case "find_and_tap":
return findAndTap(deviceId, action, currentElements); return findAndTap(deviceId, action, currentElements, screenWidth, screenHeight);
case "submit_message": case "submit_message":
return submitMessage(deviceId, currentElements); return submitMessage(deviceId, currentElements);
case "read_screen": case "read_screen":
return readScreen(deviceId, currentElements); return readScreen(deviceId, currentElements, screenWidth, screenHeight);
case "wait_for_content": case "wait_for_content":
return waitForContent(deviceId, currentElements); return waitForContent(deviceId, currentElements);
case "compose_email": case "compose_email":
@@ -93,11 +95,17 @@ async function tap(deviceId: string, x: number, y: number): Promise<void> {
await sessions.sendCommand(deviceId, { type: "tap", x, y }); await sessions.sendCommand(deviceId, { type: "tap", x, y });
} }
async function swipeDown(deviceId: string): Promise<void> { async function swipeDown(
// Scroll down = swipe from bottom to top (1080px wide screen defaults) deviceId: string,
screenWidth = 1080,
screenHeight = 2400
): Promise<void> {
const cx = Math.round(screenWidth * 0.5);
const topY = Math.round(screenHeight * 0.167);
const bottomY = Math.round(screenHeight * 0.667);
await sessions.sendCommand(deviceId, { await sessions.sendCommand(deviceId, {
type: "swipe", type: "swipe",
x1: 540, y1: 1600, x2: 540, y2: 400, x1: cx, y1: bottomY, x2: cx, y2: topY,
}); });
} }
@@ -187,7 +195,9 @@ async function copyVisibleText(
async function findAndTap( async function findAndTap(
deviceId: string, deviceId: string,
action: SkillAction, action: SkillAction,
elements: UIElement[] elements: UIElement[],
screenWidth = 1080,
screenHeight = 2400
): Promise<SkillResult> { ): Promise<SkillResult> {
const query = action.query; const query = action.query;
if (!query) { if (!query) {
@@ -206,7 +216,7 @@ async function findAndTap(
console.log( console.log(
`[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})` `[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})`
); );
await swipeDown(deviceId); await swipeDown(deviceId, screenWidth, screenHeight);
await sleep(1200); await sleep(1200);
const { elements: freshElements } = await getScreen(deviceId); const { elements: freshElements } = await getScreen(deviceId);
@@ -319,7 +329,9 @@ async function submitMessage(
async function readScreen( async function readScreen(
deviceId: string, deviceId: string,
elements: UIElement[] elements: UIElement[],
screenWidth = 1080,
screenHeight = 2400
): Promise<SkillResult> { ): Promise<SkillResult> {
const allTexts: string[] = []; const allTexts: string[] = [];
const seenTexts = new Set<string>(); const seenTexts = new Set<string>();
@@ -344,7 +356,7 @@ async function readScreen(
let scrollsDone = 0; let scrollsDone = 0;
for (let i = 0; i < maxScrolls; i++) { for (let i = 0; i < maxScrolls; i++) {
await swipeDown(deviceId); await swipeDown(deviceId, screenWidth, screenHeight);
await sleep(1200); await sleep(1200);
scrollsDone++; scrollsDone++;