fix(agent): use device screen dimensions for scroll/swipe coordinates
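Swipe coordinates were hardcoded for a 1080x2400 screen, so scrolls failed on devices with other resolutions. The agent loop now reads screenWidth and screenHeight from the connected device's DeviceInfo (falling back to 1080x2400 when they are unavailable) and computes swipe coordinates as proportions of the screen size.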
This commit is contained in:
@@ -126,9 +126,14 @@ function diffScreenState(
|
|||||||
* Maps an ActionDecision to a WebSocket command object for the device.
|
* Maps an ActionDecision to a WebSocket command object for the device.
|
||||||
* The device companion app receives these and executes the corresponding
|
* The device companion app receives these and executes the corresponding
|
||||||
* ADB/accessibility action.
|
* ADB/accessibility action.
|
||||||
|
*
|
||||||
|
* @param screenWidth Device screen width in px (from DeviceInfo)
|
||||||
|
* @param screenHeight Device screen height in px (from DeviceInfo)
|
||||||
*/
|
*/
|
||||||
function actionToCommand(
|
function actionToCommand(
|
||||||
action: ActionDecision
|
action: ActionDecision,
|
||||||
|
screenWidth = 1080,
|
||||||
|
screenHeight = 2400
|
||||||
): Record<string, unknown> {
|
): Record<string, unknown> {
|
||||||
switch (action.action) {
|
switch (action.action) {
|
||||||
case "tap":
|
case "tap":
|
||||||
@@ -152,15 +157,22 @@ function actionToCommand(
|
|||||||
|
|
||||||
case "swipe":
|
case "swipe":
|
||||||
case "scroll": {
|
case "scroll": {
|
||||||
// Map scroll direction to swipe coordinates (default 1080px wide screen)
|
// Compute swipe coordinates proportionally from device screen size
|
||||||
|
const cx = Math.round(screenWidth * 0.5);
|
||||||
|
const cy = Math.round(screenHeight * 0.5);
|
||||||
|
const topY = Math.round(screenHeight * 0.167);
|
||||||
|
const bottomY = Math.round(screenHeight * 0.667);
|
||||||
|
const leftX = Math.round(screenWidth * 0.167);
|
||||||
|
const rightX = Math.round(screenWidth * 0.833);
|
||||||
|
|
||||||
const dir = action.direction ?? "down";
|
const dir = action.direction ?? "down";
|
||||||
let x1 = 540, y1 = 1600, x2 = 540, y2 = 400;
|
let x1 = cx, y1 = bottomY, x2 = cx, y2 = topY;
|
||||||
if (dir === "up") {
|
if (dir === "up") {
|
||||||
y1 = 400; y2 = 1600; // swipe from top to bottom = scroll up
|
y1 = topY; y2 = bottomY; // swipe from top to bottom = scroll up
|
||||||
} else if (dir === "left") {
|
} else if (dir === "left") {
|
||||||
x1 = 900; y1 = 1200; x2 = 180; y2 = 1200;
|
x1 = rightX; y1 = cy; x2 = leftX; y2 = cy;
|
||||||
} else if (dir === "right") {
|
} else if (dir === "right") {
|
||||||
x1 = 180; y1 = 1200; x2 = 900; y2 = 1200;
|
x1 = leftX; y1 = cy; x2 = rightX; y2 = cy;
|
||||||
}
|
}
|
||||||
// dir === "down" uses defaults: swipe from bottom to top = scroll down
|
// dir === "down" uses defaults: swipe from bottom to top = scroll down
|
||||||
return { type: "swipe", x1, y1, x2, y2 };
|
return { type: "swipe", x1, y1, x2, y2 };
|
||||||
@@ -314,6 +326,12 @@ export async function runAgentLoop(
|
|||||||
deviceId: persistentDeviceId ?? deviceId,
|
deviceId: persistentDeviceId ?? deviceId,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Get device screen dimensions for accurate swipe coordinates
|
||||||
|
const connectedDevice = sessions.getDevice(deviceId);
|
||||||
|
const screenWidth = connectedDevice?.deviceInfo?.screenWidth ?? 1080;
|
||||||
|
const screenHeight = connectedDevice?.deviceInfo?.screenHeight ?? 2400;
|
||||||
|
console.log(`[Agent ${sessionId}] Device screen: ${screenWidth}x${screenHeight}`);
|
||||||
|
|
||||||
let stepsUsed = 0;
|
let stepsUsed = 0;
|
||||||
let success = false;
|
let success = false;
|
||||||
|
|
||||||
@@ -576,12 +594,14 @@ export async function runAgentLoop(
|
|||||||
const skillResult = await executeSkill(
|
const skillResult = await executeSkill(
|
||||||
deviceId,
|
deviceId,
|
||||||
action as unknown as Record<string, unknown> & { action: string },
|
action as unknown as Record<string, unknown> & { action: string },
|
||||||
elements
|
elements,
|
||||||
|
screenWidth,
|
||||||
|
screenHeight
|
||||||
);
|
);
|
||||||
lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`;
|
lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`;
|
||||||
} else {
|
} else {
|
||||||
// Regular action: map to WebSocket command and send to device
|
// Regular action: map to WebSocket command and send to device
|
||||||
const command = actionToCommand(action);
|
const command = actionToCommand(action, screenWidth, screenHeight);
|
||||||
const result = (await sessions.sendCommand(deviceId, command)) as {
|
const result = (await sessions.sendCommand(deviceId, command)) as {
|
||||||
success?: boolean;
|
success?: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
|
|||||||
@@ -54,17 +54,19 @@ export function isSkillAction(action: string): boolean {
|
|||||||
export async function executeSkill(
|
export async function executeSkill(
|
||||||
deviceId: string,
|
deviceId: string,
|
||||||
action: SkillAction,
|
action: SkillAction,
|
||||||
currentElements: UIElement[]
|
currentElements: UIElement[],
|
||||||
|
screenWidth = 1080,
|
||||||
|
screenHeight = 2400
|
||||||
): Promise<SkillResult> {
|
): Promise<SkillResult> {
|
||||||
switch (action.action) {
|
switch (action.action) {
|
||||||
case "copy_visible_text":
|
case "copy_visible_text":
|
||||||
return copyVisibleText(deviceId, action, currentElements);
|
return copyVisibleText(deviceId, action, currentElements);
|
||||||
case "find_and_tap":
|
case "find_and_tap":
|
||||||
return findAndTap(deviceId, action, currentElements);
|
return findAndTap(deviceId, action, currentElements, screenWidth, screenHeight);
|
||||||
case "submit_message":
|
case "submit_message":
|
||||||
return submitMessage(deviceId, currentElements);
|
return submitMessage(deviceId, currentElements);
|
||||||
case "read_screen":
|
case "read_screen":
|
||||||
return readScreen(deviceId, currentElements);
|
return readScreen(deviceId, currentElements, screenWidth, screenHeight);
|
||||||
case "wait_for_content":
|
case "wait_for_content":
|
||||||
return waitForContent(deviceId, currentElements);
|
return waitForContent(deviceId, currentElements);
|
||||||
case "compose_email":
|
case "compose_email":
|
||||||
@@ -93,11 +95,17 @@ async function tap(deviceId: string, x: number, y: number): Promise<void> {
|
|||||||
await sessions.sendCommand(deviceId, { type: "tap", x, y });
|
await sessions.sendCommand(deviceId, { type: "tap", x, y });
|
||||||
}
|
}
|
||||||
|
|
||||||
async function swipeDown(deviceId: string): Promise<void> {
|
async function swipeDown(
|
||||||
// Scroll down = swipe from bottom to top (1080px wide screen defaults)
|
deviceId: string,
|
||||||
|
screenWidth = 1080,
|
||||||
|
screenHeight = 2400
|
||||||
|
): Promise<void> {
|
||||||
|
const cx = Math.round(screenWidth * 0.5);
|
||||||
|
const topY = Math.round(screenHeight * 0.167);
|
||||||
|
const bottomY = Math.round(screenHeight * 0.667);
|
||||||
await sessions.sendCommand(deviceId, {
|
await sessions.sendCommand(deviceId, {
|
||||||
type: "swipe",
|
type: "swipe",
|
||||||
x1: 540, y1: 1600, x2: 540, y2: 400,
|
x1: cx, y1: bottomY, x2: cx, y2: topY,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -187,7 +195,9 @@ async function copyVisibleText(
|
|||||||
async function findAndTap(
|
async function findAndTap(
|
||||||
deviceId: string,
|
deviceId: string,
|
||||||
action: SkillAction,
|
action: SkillAction,
|
||||||
elements: UIElement[]
|
elements: UIElement[],
|
||||||
|
screenWidth = 1080,
|
||||||
|
screenHeight = 2400
|
||||||
): Promise<SkillResult> {
|
): Promise<SkillResult> {
|
||||||
const query = action.query;
|
const query = action.query;
|
||||||
if (!query) {
|
if (!query) {
|
||||||
@@ -206,7 +216,7 @@ async function findAndTap(
|
|||||||
console.log(
|
console.log(
|
||||||
`[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})`
|
`[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})`
|
||||||
);
|
);
|
||||||
await swipeDown(deviceId);
|
await swipeDown(deviceId, screenWidth, screenHeight);
|
||||||
await sleep(1200);
|
await sleep(1200);
|
||||||
|
|
||||||
const { elements: freshElements } = await getScreen(deviceId);
|
const { elements: freshElements } = await getScreen(deviceId);
|
||||||
@@ -319,7 +329,9 @@ async function submitMessage(
|
|||||||
|
|
||||||
async function readScreen(
|
async function readScreen(
|
||||||
deviceId: string,
|
deviceId: string,
|
||||||
elements: UIElement[]
|
elements: UIElement[],
|
||||||
|
screenWidth = 1080,
|
||||||
|
screenHeight = 2400
|
||||||
): Promise<SkillResult> {
|
): Promise<SkillResult> {
|
||||||
const allTexts: string[] = [];
|
const allTexts: string[] = [];
|
||||||
const seenTexts = new Set<string>();
|
const seenTexts = new Set<string>();
|
||||||
@@ -344,7 +356,7 @@ async function readScreen(
|
|||||||
let scrollsDone = 0;
|
let scrollsDone = 0;
|
||||||
|
|
||||||
for (let i = 0; i < maxScrolls; i++) {
|
for (let i = 0; i < maxScrolls; i++) {
|
||||||
await swipeDown(deviceId);
|
await swipeDown(deviceId, screenWidth, screenHeight);
|
||||||
await sleep(1200);
|
await sleep(1200);
|
||||||
scrollsDone++;
|
scrollsDone++;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user