fix(agent): use device screen dimensions for scroll/swipe coordinates

Swipe coordinates were hardcoded for 1080x2400 screens, causing scrolls
to fail on devices with different resolutions. The agent now reads
screenWidth and screenHeight from DeviceInfo (falling back to 1080x2400
when they are unavailable) and computes swipe coordinates proportionally.
This commit is contained in:
Sanju Sivalingam
2026-02-18 10:48:37 +05:30
parent 81d78684a5
commit a1ec1ac731
2 changed files with 50 additions and 18 deletions

View File

@@ -126,9 +126,14 @@ function diffScreenState(
* Maps an ActionDecision to a WebSocket command object for the device.
* The device companion app receives these and executes the corresponding
* ADB/accessibility action.
*
* @param screenWidth Device screen width in px (from DeviceInfo)
* @param screenHeight Device screen height in px (from DeviceInfo)
*/
function actionToCommand(
action: ActionDecision
action: ActionDecision,
screenWidth = 1080,
screenHeight = 2400
): Record<string, unknown> {
switch (action.action) {
case "tap":
@@ -152,15 +157,22 @@ function actionToCommand(
case "swipe":
case "scroll": {
// Map scroll direction to swipe coordinates (default 1080px wide screen)
// Compute swipe coordinates proportionally from device screen size
const cx = Math.round(screenWidth * 0.5);
const cy = Math.round(screenHeight * 0.5);
const topY = Math.round(screenHeight * 0.167);
const bottomY = Math.round(screenHeight * 0.667);
const leftX = Math.round(screenWidth * 0.167);
const rightX = Math.round(screenWidth * 0.833);
const dir = action.direction ?? "down";
let x1 = 540, y1 = 1600, x2 = 540, y2 = 400;
let x1 = cx, y1 = bottomY, x2 = cx, y2 = topY;
if (dir === "up") {
y1 = 400; y2 = 1600; // swipe from top to bottom = scroll up
y1 = topY; y2 = bottomY; // swipe from top to bottom = scroll up
} else if (dir === "left") {
x1 = 900; y1 = 1200; x2 = 180; y2 = 1200;
x1 = rightX; y1 = cy; x2 = leftX; y2 = cy;
} else if (dir === "right") {
x1 = 180; y1 = 1200; x2 = 900; y2 = 1200;
x1 = leftX; y1 = cy; x2 = rightX; y2 = cy;
}
// dir === "down" uses defaults: swipe from bottom to top = scroll down
return { type: "swipe", x1, y1, x2, y2 };
@@ -314,6 +326,12 @@ export async function runAgentLoop(
deviceId: persistentDeviceId ?? deviceId,
});
// Get device screen dimensions for accurate swipe coordinates
const connectedDevice = sessions.getDevice(deviceId);
const screenWidth = connectedDevice?.deviceInfo?.screenWidth ?? 1080;
const screenHeight = connectedDevice?.deviceInfo?.screenHeight ?? 2400;
console.log(`[Agent ${sessionId}] Device screen: ${screenWidth}x${screenHeight}`);
let stepsUsed = 0;
let success = false;
@@ -576,12 +594,14 @@ export async function runAgentLoop(
const skillResult = await executeSkill(
deviceId,
action as unknown as Record<string, unknown> & { action: string },
elements
elements,
screenWidth,
screenHeight
);
lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`;
} else {
// Regular action: map to WebSocket command and send to device
const command = actionToCommand(action);
const command = actionToCommand(action, screenWidth, screenHeight);
const result = (await sessions.sendCommand(deviceId, command)) as {
success?: boolean;
error?: string;

View File

@@ -54,17 +54,19 @@ export function isSkillAction(action: string): boolean {
export async function executeSkill(
deviceId: string,
action: SkillAction,
currentElements: UIElement[]
currentElements: UIElement[],
screenWidth = 1080,
screenHeight = 2400
): Promise<SkillResult> {
switch (action.action) {
case "copy_visible_text":
return copyVisibleText(deviceId, action, currentElements);
case "find_and_tap":
return findAndTap(deviceId, action, currentElements);
return findAndTap(deviceId, action, currentElements, screenWidth, screenHeight);
case "submit_message":
return submitMessage(deviceId, currentElements);
case "read_screen":
return readScreen(deviceId, currentElements);
return readScreen(deviceId, currentElements, screenWidth, screenHeight);
case "wait_for_content":
return waitForContent(deviceId, currentElements);
case "compose_email":
@@ -93,11 +95,17 @@ async function tap(deviceId: string, x: number, y: number): Promise<void> {
await sessions.sendCommand(deviceId, { type: "tap", x, y });
}
async function swipeDown(deviceId: string): Promise<void> {
// Scroll down = swipe from bottom to top (1080px wide screen defaults)
async function swipeDown(
deviceId: string,
screenWidth = 1080,
screenHeight = 2400
): Promise<void> {
const cx = Math.round(screenWidth * 0.5);
const topY = Math.round(screenHeight * 0.167);
const bottomY = Math.round(screenHeight * 0.667);
await sessions.sendCommand(deviceId, {
type: "swipe",
x1: 540, y1: 1600, x2: 540, y2: 400,
x1: cx, y1: bottomY, x2: cx, y2: topY,
});
}
@@ -187,7 +195,9 @@ async function copyVisibleText(
async function findAndTap(
deviceId: string,
action: SkillAction,
elements: UIElement[]
elements: UIElement[],
screenWidth = 1080,
screenHeight = 2400
): Promise<SkillResult> {
const query = action.query;
if (!query) {
@@ -206,7 +216,7 @@ async function findAndTap(
console.log(
`[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})`
);
await swipeDown(deviceId);
await swipeDown(deviceId, screenWidth, screenHeight);
await sleep(1200);
const { elements: freshElements } = await getScreen(deviceId);
@@ -319,7 +329,9 @@ async function submitMessage(
async function readScreen(
deviceId: string,
elements: UIElement[]
elements: UIElement[],
screenWidth = 1080,
screenHeight = 2400
): Promise<SkillResult> {
const allTexts: string[] = [];
const seenTexts = new Set<string>();
@@ -344,7 +356,7 @@ async function readScreen(
let scrollsDone = 0;
for (let i = 0; i < maxScrolls; i++) {
await swipeDown(deviceId);
await swipeDown(deviceId, screenWidth, screenHeight);
await sleep(1200);
scrollsDone++;