fix(agent): use device screen dimensions for scroll/swipe coordinates
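Swipe coordinates were hardcoded for 1080x2400 screens, causing scrolls to fail on devices with different resolutions. The agent now reads screenWidth and screenHeight from DeviceInfo and computes swipe coordinates proportionally to the screen size, falling back to 1080x2400 when the device does not report its dimensions.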
This commit is contained in:
@@ -126,9 +126,14 @@ function diffScreenState(
|
||||
* Maps an ActionDecision to a WebSocket command object for the device.
|
||||
* The device companion app receives these and executes the corresponding
|
||||
* ADB/accessibility action.
|
||||
*
|
||||
* @param screenWidth Device screen width in px (from DeviceInfo)
|
||||
* @param screenHeight Device screen height in px (from DeviceInfo)
|
||||
*/
|
||||
function actionToCommand(
|
||||
action: ActionDecision
|
||||
action: ActionDecision,
|
||||
screenWidth = 1080,
|
||||
screenHeight = 2400
|
||||
): Record<string, unknown> {
|
||||
switch (action.action) {
|
||||
case "tap":
|
||||
@@ -152,15 +157,22 @@ function actionToCommand(
|
||||
|
||||
case "swipe":
|
||||
case "scroll": {
|
||||
// Map scroll direction to swipe coordinates (default 1080px wide screen)
|
||||
// Compute swipe coordinates proportionally from device screen size
|
||||
const cx = Math.round(screenWidth * 0.5);
|
||||
const cy = Math.round(screenHeight * 0.5);
|
||||
const topY = Math.round(screenHeight * 0.167);
|
||||
const bottomY = Math.round(screenHeight * 0.667);
|
||||
const leftX = Math.round(screenWidth * 0.167);
|
||||
const rightX = Math.round(screenWidth * 0.833);
|
||||
|
||||
const dir = action.direction ?? "down";
|
||||
let x1 = 540, y1 = 1600, x2 = 540, y2 = 400;
|
||||
let x1 = cx, y1 = bottomY, x2 = cx, y2 = topY;
|
||||
if (dir === "up") {
|
||||
y1 = 400; y2 = 1600; // swipe from top to bottom = scroll up
|
||||
y1 = topY; y2 = bottomY; // swipe from top to bottom = scroll up
|
||||
} else if (dir === "left") {
|
||||
x1 = 900; y1 = 1200; x2 = 180; y2 = 1200;
|
||||
x1 = rightX; y1 = cy; x2 = leftX; y2 = cy;
|
||||
} else if (dir === "right") {
|
||||
x1 = 180; y1 = 1200; x2 = 900; y2 = 1200;
|
||||
x1 = leftX; y1 = cy; x2 = rightX; y2 = cy;
|
||||
}
|
||||
// dir === "down" uses defaults: swipe from bottom to top = scroll down
|
||||
return { type: "swipe", x1, y1, x2, y2 };
|
||||
@@ -314,6 +326,12 @@ export async function runAgentLoop(
|
||||
deviceId: persistentDeviceId ?? deviceId,
|
||||
});
|
||||
|
||||
// Get device screen dimensions for accurate swipe coordinates
|
||||
const connectedDevice = sessions.getDevice(deviceId);
|
||||
const screenWidth = connectedDevice?.deviceInfo?.screenWidth ?? 1080;
|
||||
const screenHeight = connectedDevice?.deviceInfo?.screenHeight ?? 2400;
|
||||
console.log(`[Agent ${sessionId}] Device screen: ${screenWidth}x${screenHeight}`);
|
||||
|
||||
let stepsUsed = 0;
|
||||
let success = false;
|
||||
|
||||
@@ -576,12 +594,14 @@ export async function runAgentLoop(
|
||||
const skillResult = await executeSkill(
|
||||
deviceId,
|
||||
action as unknown as Record<string, unknown> & { action: string },
|
||||
elements
|
||||
elements,
|
||||
screenWidth,
|
||||
screenHeight
|
||||
);
|
||||
lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`;
|
||||
} else {
|
||||
// Regular action: map to WebSocket command and send to device
|
||||
const command = actionToCommand(action);
|
||||
const command = actionToCommand(action, screenWidth, screenHeight);
|
||||
const result = (await sessions.sendCommand(deviceId, command)) as {
|
||||
success?: boolean;
|
||||
error?: string;
|
||||
|
||||
@@ -54,17 +54,19 @@ export function isSkillAction(action: string): boolean {
|
||||
export async function executeSkill(
|
||||
deviceId: string,
|
||||
action: SkillAction,
|
||||
currentElements: UIElement[]
|
||||
currentElements: UIElement[],
|
||||
screenWidth = 1080,
|
||||
screenHeight = 2400
|
||||
): Promise<SkillResult> {
|
||||
switch (action.action) {
|
||||
case "copy_visible_text":
|
||||
return copyVisibleText(deviceId, action, currentElements);
|
||||
case "find_and_tap":
|
||||
return findAndTap(deviceId, action, currentElements);
|
||||
return findAndTap(deviceId, action, currentElements, screenWidth, screenHeight);
|
||||
case "submit_message":
|
||||
return submitMessage(deviceId, currentElements);
|
||||
case "read_screen":
|
||||
return readScreen(deviceId, currentElements);
|
||||
return readScreen(deviceId, currentElements, screenWidth, screenHeight);
|
||||
case "wait_for_content":
|
||||
return waitForContent(deviceId, currentElements);
|
||||
case "compose_email":
|
||||
@@ -93,11 +95,17 @@ async function tap(deviceId: string, x: number, y: number): Promise<void> {
|
||||
await sessions.sendCommand(deviceId, { type: "tap", x, y });
|
||||
}
|
||||
|
||||
async function swipeDown(deviceId: string): Promise<void> {
|
||||
// Scroll down = swipe from bottom to top (1080px wide screen defaults)
|
||||
async function swipeDown(
|
||||
deviceId: string,
|
||||
screenWidth = 1080,
|
||||
screenHeight = 2400
|
||||
): Promise<void> {
|
||||
const cx = Math.round(screenWidth * 0.5);
|
||||
const topY = Math.round(screenHeight * 0.167);
|
||||
const bottomY = Math.round(screenHeight * 0.667);
|
||||
await sessions.sendCommand(deviceId, {
|
||||
type: "swipe",
|
||||
x1: 540, y1: 1600, x2: 540, y2: 400,
|
||||
x1: cx, y1: bottomY, x2: cx, y2: topY,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -187,7 +195,9 @@ async function copyVisibleText(
|
||||
async function findAndTap(
|
||||
deviceId: string,
|
||||
action: SkillAction,
|
||||
elements: UIElement[]
|
||||
elements: UIElement[],
|
||||
screenWidth = 1080,
|
||||
screenHeight = 2400
|
||||
): Promise<SkillResult> {
|
||||
const query = action.query;
|
||||
if (!query) {
|
||||
@@ -206,7 +216,7 @@ async function findAndTap(
|
||||
console.log(
|
||||
`[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})`
|
||||
);
|
||||
await swipeDown(deviceId);
|
||||
await swipeDown(deviceId, screenWidth, screenHeight);
|
||||
await sleep(1200);
|
||||
|
||||
const { elements: freshElements } = await getScreen(deviceId);
|
||||
@@ -319,7 +329,9 @@ async function submitMessage(
|
||||
|
||||
async function readScreen(
|
||||
deviceId: string,
|
||||
elements: UIElement[]
|
||||
elements: UIElement[],
|
||||
screenWidth = 1080,
|
||||
screenHeight = 2400
|
||||
): Promise<SkillResult> {
|
||||
const allTexts: string[] = [];
|
||||
const seenTexts = new Set<string>();
|
||||
@@ -344,7 +356,7 @@ async function readScreen(
|
||||
let scrollsDone = 0;
|
||||
|
||||
for (let i = 0; i < maxScrolls; i++) {
|
||||
await swipeDown(deviceId);
|
||||
await swipeDown(deviceId, screenWidth, screenHeight);
|
||||
await sleep(1200);
|
||||
scrollsDone++;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user