/**
* LLM Provider module for DroidClaw.
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
*
* Phase 3: Real multimodal vision (image content parts)
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
* Phase 5: Streaming responses (getDecisionStream)
*/
import OpenAI from "openai";
import {
BedrockRuntimeClient,
InvokeModelCommand,
InvokeModelWithResponseStreamCommand,
} from "@aws-sdk/client-bedrock-runtime";
import { generateText, streamText } from "ai";
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
import { Config } from "./config.js";
import {
GROQ_API_BASE_URL,
BEDROCK_ANTHROPIC_MODELS,
BEDROCK_META_MODELS,
} from "./constants.js";
import type { ActionDecision } from "./actions.js";
// ===========================================
// System Prompt — all 15 actions + planning
// ===========================================
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
You will receive:
1. GOAL — the user's task.
2. FOREGROUND_APP — the currently active app package and activity.
3. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates and states.
4. SCREENSHOT — an image of the current screen (when available).
5. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
6. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
Previous conversation turns contain your earlier observations and actions (multi-turn memory).
You must output ONLY a valid JSON object with your next action.
═══════════════════════════════════════════
THINKING & PLANNING
═══════════════════════════════════════════
Before each action, include a "think" field with your reasoning about the current state and what to do next.
Optionally include:
- "plan": an array of 3-5 high-level steps to achieve the goal
- "planProgress": a brief note on which plan step you're currently on
Example:
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
═══════════════════════════════════════════
AVAILABLE ACTIONS (15 total)
═══════════════════════════════════════════
Navigation:
{"action": "tap", "coordinates": [x, y], "reason": "..."}
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
{"action": "enter", "reason": "Press Enter/submit"}
{"action": "back", "reason": "Navigate back"}
{"action": "home", "reason": "Go to home screen"}
Text Input:
{"action": "type", "text": "Hello World", "reason": "..."}
{"action": "clear", "reason": "Clear current text field before typing"}
App Control:
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
Data:
{"action": "screenshot", "reason": "Capture current screen"}
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
{"action": "clipboard_get", "reason": "Read clipboard contents"}
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
System:
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
{"action": "wait", "reason": "Wait for screen to load"}
{"action": "done", "reason": "Task is complete"}
═══════════════════════════════════════════
ELEMENT PROPERTIES YOU WILL SEE
═══════════════════════════════════════════
Each element in SCREEN_CONTEXT has:
- text: visible label or content description
- center: [x, y] coordinates to tap
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
- enabled: false (only shown when disabled — DO NOT tap disabled elements!)
- checked: true (only shown for ON checkboxes/toggles)
- focused: true (only shown when field has input focus)
- hint: placeholder text (only shown when present)
- editable: true (only shown for text input fields)
- scrollable: true (only shown for scrollable containers)
═══════════════════════════════════════════
CRITICAL RULES
═══════════════════════════════════════════
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered.
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
11. PASSWORDS: Never log or output the text of password fields.
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
13. SEARCH: After typing in a search field, use "enter" to submit the search.
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
// ===========================================
// Chat Message Types (Phase 4A)
// ===========================================
export type ContentPart =
| { type: "text"; text: string }
| { type: "image"; base64: string; mimeType: "image/png" | "image/jpeg" };
export interface ChatMessage {
role: "system" | "user" | "assistant";
content: string | ContentPart[];
}
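/*
 * Illustrative ChatMessage values (a sketch, not used elsewhere in this module;
 * `screenshotBase64` is a hypothetical variable holding base64-encoded PNG data):
 *
 *   const textTurn: ChatMessage = { role: "user", content: "GOAL: enable dark theme" };
 *   const visionTurn: ChatMessage = {
 *     role: "user",
 *     content: [
 *       { type: "text", text: "SCREEN_CONTEXT: []" },
 *       { type: "image", base64: screenshotBase64, mimeType: "image/png" },
 *     ],
 *   };
 */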
// ===========================================
// Provider Interface
// ===========================================
export interface LLMProvider {
readonly capabilities: {
supportsImages: boolean;
supportsStreaming: boolean;
};
getDecision(messages: ChatMessage[]): Promise<ActionDecision>;
getDecisionStream?(messages: ChatMessage[]): AsyncIterable<string>;
}
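/*
 * Capability-gating sketch (illustrative; `provider`, `prompt`, and `screenshotBase64`
 * are assumed to be supplied by the caller):
 *
 *   const parts: ContentPart[] = [{ type: "text", text: prompt }];
 *   if (provider.capabilities.supportsImages && screenshotBase64) {
 *     parts.push({ type: "image", base64: screenshotBase64, mimeType: "image/png" });
 *   }
 */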
// ===========================================
// Message Trimming (Phase 4A)
// ===========================================
/**
* Trims conversation messages to keep within history limit.
* Always keeps the system message. Drops oldest user/assistant pairs.
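*
* @example
* // With maxHistorySteps = 2, the conversation
* // [system, u1, a1, u2, a2, u3, a3] trims to
* // [system, "[1 earlier steps omitted]", u2, a2, u3, a3]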
*/
export function trimMessages(
messages: ChatMessage[],
maxHistorySteps: number
): ChatMessage[] {
if (messages.length === 0) return messages;
// The system message, if present, is always first
const system = messages[0].role === "system" ? messages[0] : null;
const rest = system ? messages.slice(1) : messages;
// Count user/assistant pairs (each step = 1 user + 1 assistant)
const maxMessages = maxHistorySteps * 2;
if (rest.length <= maxMessages) {
return messages;
}
const dropped = rest.length - maxMessages;
const stepsDropped = Math.floor(dropped / 2);
const trimmed = rest.slice(dropped);
// Insert a summary note
const summary: ChatMessage = {
role: "user",
content: `[${stepsDropped} earlier steps omitted]`,
};
return system ? [system, summary, ...trimmed] : [summary, ...trimmed];
}
// ===========================================
// OpenAI / Groq Provider
// ===========================================
class OpenAIProvider implements LLMProvider {
private client: OpenAI;
private model: string;
readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };
constructor() {
if (Config.LLM_PROVIDER === "groq") {
this.client = new OpenAI({
apiKey: Config.GROQ_API_KEY,
baseURL: GROQ_API_BASE_URL,
});
this.model = Config.GROQ_MODEL;
this.capabilities = { supportsImages: false, supportsStreaming: true };
} else {
this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
this.model = Config.OPENAI_MODEL;
this.capabilities = { supportsImages: true, supportsStreaming: true };
}
}
private toOpenAIMessages(
messages: ChatMessage[]
): OpenAI.ChatCompletionMessageParam[] {
return messages.map((msg) => {
if (typeof msg.content === "string") {
return { role: msg.role, content: msg.content } as OpenAI.ChatCompletionMessageParam;
}
// Convert ContentPart[] to OpenAI format
const parts: OpenAI.ChatCompletionContentPart[] = msg.content.map(
(part) => {
if (part.type === "text") {
return { type: "text" as const, text: part.text };
}
// Image — only for OpenAI (Groq skips images)
if (this.capabilities.supportsImages) {
return {
type: "image_url" as const,
image_url: {
url: `data:${part.mimeType};base64,${part.base64}`,
detail: "low" as const,
},
};
}
// Groq: convert image to text placeholder
return { type: "text" as const, text: "[Screenshot attached]" };
}
);
return {
role: msg.role,
content: parts,
} as OpenAI.ChatCompletionMessageParam;
});
}
async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
const openaiMessages = this.toOpenAIMessages(messages);
const response = await this.client.chat.completions.create({
model: this.model,
response_format: { type: "json_object" },
messages: openaiMessages,
});
return parseJsonResponse(response.choices[0].message.content ?? "{}");
}
async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
const openaiMessages = this.toOpenAIMessages(messages);
const stream = await this.client.chat.completions.create({
model: this.model,
response_format: { type: "json_object" },
messages: openaiMessages,
stream: true,
});
for await (const chunk of stream) {
const content = chunk.choices[0]?.delta?.content;
if (content) yield content;
}
}
}
// ===========================================
// OpenRouter Provider (Vercel AI SDK)
// ===========================================
class OpenRouterProvider implements LLMProvider {
private openrouter: ReturnType<typeof createOpenRouter>;
private model: string;
readonly capabilities = { supportsImages: true, supportsStreaming: true };
constructor() {
this.openrouter = createOpenRouter({
apiKey: Config.OPENROUTER_API_KEY,
});
this.model = Config.OPENROUTER_MODEL;
}
private toVercelMessages(messages: ChatMessage[]) {
// Vercel AI SDK uses a similar format but we need to convert images
const systemMsg = messages.find((m) => m.role === "system");
const nonSystem = messages.filter((m) => m.role !== "system");
const converted = nonSystem.map((msg) => {
if (typeof msg.content === "string") {
return { role: msg.role as "user" | "assistant", content: msg.content };
}
const parts = msg.content.map((part) => {
if (part.type === "text") {
return { type: "text" as const, text: part.text };
}
return {
type: "image" as const,
image: `data:${part.mimeType};base64,${part.base64}`,
};
});
return { role: msg.role as "user" | "assistant", content: parts };
});
return {
system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
messages: converted,
};
}
async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
const { system, messages: converted } = this.toVercelMessages(messages);
const result = await generateText({
model: this.openrouter.chat(this.model),
system,
messages: converted as any,
});
return parseJsonResponse(result.text);
}
async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
const { system, messages: converted } = this.toVercelMessages(messages);
const result = streamText({
model: this.openrouter.chat(this.model),
system,
messages: converted as any,
});
for await (const chunk of result.textStream) {
yield chunk;
}
}
}
// ===========================================
// AWS Bedrock Provider
// ===========================================
class BedrockProvider implements LLMProvider {
private client: BedrockRuntimeClient;
private model: string;
readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };
constructor() {
this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
this.model = Config.BEDROCK_MODEL;
// Only Anthropic models on Bedrock support images
this.capabilities = {
supportsImages: this.isAnthropicModel(),
supportsStreaming: true,
};
}
private isAnthropicModel(): boolean {
return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id));
}
private isMetaModel(): boolean {
return BEDROCK_META_MODELS.some((id) =>
this.model.toLowerCase().includes(id)
);
}
private buildAnthropicMessages(messages: ChatMessage[]) {
const systemMsg = messages.find((m) => m.role === "system");
const nonSystem = messages.filter((m) => m.role !== "system");
const converted = nonSystem.map((msg) => {
if (typeof msg.content === "string") {
return { role: msg.role, content: msg.content };
}
const parts = msg.content.map((part) => {
if (part.type === "text") {
return { type: "text", text: part.text };
}
return {
type: "image",
source: {
type: "base64",
media_type: part.mimeType,
data: part.base64,
},
};
});
return { role: msg.role, content: parts };
});
return {
system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
messages: converted,
};
}
private buildRequest(messages: ChatMessage[]): string {
if (this.isAnthropicModel()) {
const { system, messages: converted } = this.buildAnthropicMessages(messages);
return JSON.stringify({
anthropic_version: "bedrock-2023-05-31",
max_tokens: 1024,
system,
messages: converted,
});
}
// For Meta/other models, flatten to single prompt (no multi-turn / image support)
const systemContent = messages.find((m) => m.role === "system");
const userMessages = messages
.filter((m) => m.role === "user")
.map((m) =>
typeof m.content === "string"
? m.content
: m.content
.filter((p) => p.type === "text")
.map((p) => (p as { type: "text"; text: string }).text)
.join("\n")
);
const lastUserContent = userMessages[userMessages.length - 1] ?? "";
const sysText =
typeof systemContent?.content === "string" ? systemContent.content : "";
if (this.isMetaModel()) {
return JSON.stringify({
prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${sysText}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
max_gen_len: 512,
temperature: 0.1,
});
}
return JSON.stringify({
inputText: `${sysText}\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object.`,
textGenerationConfig: {
maxTokenCount: 512,
temperature: 0.1,
},
});
}
private extractResponse(responseBody: Record<string, any>): string {
if (this.isAnthropicModel()) {
return responseBody.content[0].text;
}
if (this.isMetaModel()) {
return responseBody.generation ?? "";
}
return responseBody.results[0].outputText;
}
async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
const requestBody = this.buildRequest(messages);
const command = new InvokeModelCommand({
modelId: this.model,
body: new TextEncoder().encode(requestBody),
contentType: "application/json",
accept: "application/json",
});
const response = await this.client.send(command);
const responseBody = JSON.parse(new TextDecoder().decode(response.body));
const resultText = this.extractResponse(responseBody);
return parseJsonResponse(resultText);
}
async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
if (!this.isAnthropicModel()) {
// Fallback: non-streaming for non-Anthropic models
const decision = await this.getDecision(messages);
yield JSON.stringify(decision);
return;
}
const { system, messages: converted } = this.buildAnthropicMessages(messages);
const requestBody = JSON.stringify({
anthropic_version: "bedrock-2023-05-31",
max_tokens: 1024,
system,
messages: converted,
});
const command = new InvokeModelWithResponseStreamCommand({
modelId: this.model,
body: new TextEncoder().encode(requestBody),
contentType: "application/json",
});
const response = await this.client.send(command);
if (response.body) {
for await (const event of response.body) {
if (event.chunk?.bytes) {
const data = JSON.parse(new TextDecoder().decode(event.chunk.bytes));
if (data.type === "content_block_delta" && data.delta?.text) {
yield data.delta.text;
}
}
}
}
}
}
// ===========================================
// Shared JSON Parsing
// ===========================================
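/*
 * parseJsonResponse tolerates prose or markdown fences around the JSON object.
 * Illustrative input it is expected to recover (the surrounding text is made up):
 *
 *   "Here is my next action:\n{\"action\": \"tap\", \"coordinates\": [540, 1200], \"reason\": \"Open Display settings\"}"
 *
 * The brace match below extracts the object; if nothing parses, the agent
 * falls back to a "wait" action.
 */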
function parseJsonResponse(text: string): ActionDecision {
try {
return JSON.parse(text);
} catch {
// Try to extract JSON from markdown code blocks or mixed text.
// Match greedily from the first "{" to the last "}" so decisions with
// nested objects (e.g. the "extras" map on a launch action) are kept whole.
const match = text.match(/\{[\s\S]*\}/);
if (match) {
try {
return JSON.parse(match[0]);
} catch {
// fall through
}
}
console.warn(`Could not parse LLM response: ${text.slice(0, 200)}`);
return { action: "wait", reason: "Failed to parse response, waiting" };
}
}
// ===========================================
// Factory
// ===========================================
export function getLlmProvider(): LLMProvider {
if (Config.LLM_PROVIDER === "bedrock") {
return new BedrockProvider();
}
if (Config.LLM_PROVIDER === "openrouter") {
return new OpenRouterProvider();
}
return new OpenAIProvider();
}
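/*
 * End-to-end usage sketch (illustrative, run inside an async function; the agent
 * loop that builds `messages` lives elsewhere in DroidClaw, and `MAX_HISTORY_STEPS`
 * is a hypothetical history limit):
 *
 *   const provider = getLlmProvider();
 *   const messages: ChatMessage[] = [
 *     { role: "system", content: SYSTEM_PROMPT },
 *     { role: "user", content: "GOAL: enable dark theme\nFOREGROUND_APP: com.android.settings" },
 *   ];
 *   const decision = await provider.getDecision(trimMessages(messages, MAX_HISTORY_STEPS));
 *   // decision.action should be one of the 15 actions listed in SYSTEM_PROMPT, e.g. "swipe" or "done".
 *
 *   // Streaming variant, when the provider supports it:
 *   if (provider.capabilities.supportsStreaming && provider.getDecisionStream) {
 *     let raw = "";
 *     for await (const chunk of provider.getDecisionStream(messages)) raw += chunk;
 *   }
 */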