diff --git a/server/src/agent/loop.ts b/server/src/agent/loop.ts index a39d6c6..7cdcddf 100644 --- a/server/src/agent/loop.ts +++ b/server/src/agent/loop.ts @@ -126,9 +126,14 @@ function diffScreenState( * Maps an ActionDecision to a WebSocket command object for the device. * The device companion app receives these and executes the corresponding * ADB/accessibility action. + * + * @param screenWidth Device screen width in px (from DeviceInfo) + * @param screenHeight Device screen height in px (from DeviceInfo) */ function actionToCommand( - action: ActionDecision + action: ActionDecision, + screenWidth = 1080, + screenHeight = 2400 ): Record { switch (action.action) { case "tap": @@ -152,15 +157,22 @@ function actionToCommand( case "swipe": case "scroll": { - // Map scroll direction to swipe coordinates (default 1080px wide screen) + // Compute swipe coordinates proportionally from device screen size + const cx = Math.round(screenWidth * 0.5); + const cy = Math.round(screenHeight * 0.5); + const topY = Math.round(screenHeight * 0.167); + const bottomY = Math.round(screenHeight * 0.667); + const leftX = Math.round(screenWidth * 0.167); + const rightX = Math.round(screenWidth * 0.833); + const dir = action.direction ?? "down"; - let x1 = 540, y1 = 1600, x2 = 540, y2 = 400; + let x1 = cx, y1 = bottomY, x2 = cx, y2 = topY; if (dir === "up") { - y1 = 400; y2 = 1600; // swipe from top to bottom = scroll up + y1 = topY; y2 = bottomY; // swipe from top to bottom = scroll up } else if (dir === "left") { - x1 = 900; y1 = 1200; x2 = 180; y2 = 1200; + x1 = rightX; y1 = cy; x2 = leftX; y2 = cy; } else if (dir === "right") { - x1 = 180; y1 = 1200; x2 = 900; y2 = 1200; + x1 = leftX; y1 = cy; x2 = rightX; y2 = cy; } // dir === "down" uses defaults: swipe from bottom to top = scroll down return { type: "swipe", x1, y1, x2, y2 }; @@ -314,6 +326,12 @@ export async function runAgentLoop( deviceId: persistentDeviceId ?? deviceId, }); + // Get device screen dimensions for accurate swipe coordinates + const connectedDevice = sessions.getDevice(deviceId); + const screenWidth = connectedDevice?.deviceInfo?.screenWidth ?? 1080; + const screenHeight = connectedDevice?.deviceInfo?.screenHeight ?? 2400; + console.log(`[Agent ${sessionId}] Device screen: ${screenWidth}x${screenHeight}`); + let stepsUsed = 0; let success = false; @@ -576,12 +594,14 @@ export async function runAgentLoop( const skillResult = await executeSkill( deviceId, action as unknown as Record & { action: string }, - elements + elements, + screenWidth, + screenHeight ); lastActionFeedback = `${actionSig} -> ${skillResult.success ? "OK" : "FAILED"}: ${skillResult.message}`; } else { // Regular action: map to WebSocket command and send to device - const command = actionToCommand(action); + const command = actionToCommand(action, screenWidth, screenHeight); const result = (await sessions.sendCommand(deviceId, command)) as { success?: boolean; error?: string; diff --git a/server/src/agent/skills.ts b/server/src/agent/skills.ts index 9868cf0..ddc3477 100644 --- a/server/src/agent/skills.ts +++ b/server/src/agent/skills.ts @@ -54,17 +54,19 @@ export function isSkillAction(action: string): boolean { export async function executeSkill( deviceId: string, action: SkillAction, - currentElements: UIElement[] + currentElements: UIElement[], + screenWidth = 1080, + screenHeight = 2400 ): Promise { switch (action.action) { case "copy_visible_text": return copyVisibleText(deviceId, action, currentElements); case "find_and_tap": - return findAndTap(deviceId, action, currentElements); + return findAndTap(deviceId, action, currentElements, screenWidth, screenHeight); case "submit_message": return submitMessage(deviceId, currentElements); case "read_screen": - return readScreen(deviceId, currentElements); + return readScreen(deviceId, currentElements, screenWidth, screenHeight); case "wait_for_content": return waitForContent(deviceId, currentElements); case "compose_email": @@ -93,11 +95,17 @@ async function tap(deviceId: string, x: number, y: number): Promise { await sessions.sendCommand(deviceId, { type: "tap", x, y }); } -async function swipeDown(deviceId: string): Promise { - // Scroll down = swipe from bottom to top (1080px wide screen defaults) +async function swipeDown( + deviceId: string, + screenWidth = 1080, + screenHeight = 2400 +): Promise { + const cx = Math.round(screenWidth * 0.5); + const topY = Math.round(screenHeight * 0.167); + const bottomY = Math.round(screenHeight * 0.667); await sessions.sendCommand(deviceId, { type: "swipe", - x1: 540, y1: 1600, x2: 540, y2: 400, + x1: cx, y1: bottomY, x2: cx, y2: topY, }); } @@ -187,7 +195,9 @@ async function copyVisibleText( async function findAndTap( deviceId: string, action: SkillAction, - elements: UIElement[] + elements: UIElement[], + screenWidth = 1080, + screenHeight = 2400 ): Promise { const query = action.query; if (!query) { @@ -206,7 +216,7 @@ async function findAndTap( console.log( `[Skill] find_and_tap: "${query}" not visible, scrolling down (${i + 1}/${maxScrolls})` ); - await swipeDown(deviceId); + await swipeDown(deviceId, screenWidth, screenHeight); await sleep(1200); const { elements: freshElements } = await getScreen(deviceId); @@ -319,7 +329,9 @@ async function submitMessage( async function readScreen( deviceId: string, - elements: UIElement[] + elements: UIElement[], + screenWidth = 1080, + screenHeight = 2400 ): Promise { const allTexts: string[] = []; const seenTexts = new Set(); @@ -344,7 +356,7 @@ async function readScreen( let scrollsDone = 0; for (let i = 0; i < maxScrolls; i++) { - await swipeDown(deviceId); + await swipeDown(deviceId, screenWidth, screenHeight); await sleep(1200); scrollsDone++;