536 lines
19 KiB
TypeScript
536 lines
19 KiB
TypeScript
/**
|
|
* LLM Provider module for DroidClaw.
|
|
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
|
|
*
|
|
* Phase 3: Real multimodal vision (image content parts)
|
|
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
|
|
* Phase 5: Streaming responses (getDecisionStream)
|
|
*/
|
|
|
|
import OpenAI from "openai";
|
|
import {
|
|
BedrockRuntimeClient,
|
|
InvokeModelCommand,
|
|
InvokeModelWithResponseStreamCommand,
|
|
} from "@aws-sdk/client-bedrock-runtime";
|
|
import { generateText, streamText } from "ai";
|
|
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
|
|
|
import { Config } from "./config.js";
|
|
import {
|
|
GROQ_API_BASE_URL,
|
|
BEDROCK_ANTHROPIC_MODELS,
|
|
BEDROCK_META_MODELS,
|
|
} from "./constants.js";
|
|
import type { ActionDecision } from "./actions.js";
|
|
|
|
// ===========================================
|
|
// System Prompt — all 15 actions + planning
|
|
// ===========================================
|
|
|
|
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
|
|
|
|
You will receive:
|
|
1. GOAL — the user's task.
|
|
2. FOREGROUND_APP — the currently active app package and activity.
|
|
3. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates and states.
|
|
4. SCREENSHOT — an image of the current screen (when available).
|
|
5. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
|
|
6. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
|
|
|
|
Previous conversation turns contain your earlier observations and actions (multi-turn memory).
|
|
|
|
You must output ONLY a valid JSON object with your next action.
|
|
|
|
═══════════════════════════════════════════
|
|
THINKING & PLANNING
|
|
═══════════════════════════════════════════
|
|
|
|
Before each action, include a "think" field with your reasoning about the current state and what to do next.
|
|
|
|
Optionally include:
|
|
- "plan": an array of 3-5 high-level steps to achieve the goal
|
|
- "planProgress": a brief note on which plan step you're currently on
|
|
|
|
Example:
|
|
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
|
|
|
|
═══════════════════════════════════════════
|
|
AVAILABLE ACTIONS (15 total)
|
|
═══════════════════════════════════════════
|
|
|
|
Navigation:
|
|
{"action": "tap", "coordinates": [x, y], "reason": "..."}
|
|
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
|
|
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
|
|
{"action": "enter", "reason": "Press Enter/submit"}
|
|
{"action": "back", "reason": "Navigate back"}
|
|
{"action": "home", "reason": "Go to home screen"}
|
|
|
|
Text Input:
|
|
{"action": "type", "text": "Hello World", "reason": "..."}
|
|
{"action": "clear", "reason": "Clear current text field before typing"}
|
|
|
|
App Control:
|
|
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
|
|
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
|
|
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
|
|
|
|
Data:
|
|
{"action": "screenshot", "reason": "Capture current screen"}
|
|
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
|
|
{"action": "clipboard_get", "reason": "Read clipboard contents"}
|
|
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
|
|
|
|
System:
|
|
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
|
|
{"action": "wait", "reason": "Wait for screen to load"}
|
|
{"action": "done", "reason": "Task is complete"}
|
|
|
|
═══════════════════════════════════════════
|
|
ELEMENT PROPERTIES YOU WILL SEE
|
|
═══════════════════════════════════════════
|
|
|
|
Each element in SCREEN_CONTEXT has:
|
|
- text: visible label or content description
|
|
- center: [x, y] coordinates to tap
|
|
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
|
|
- enabled: false (only shown when disabled — DO NOT tap disabled elements!)
|
|
- checked: true (only shown for ON checkboxes/toggles)
|
|
- focused: true (only shown when field has input focus)
|
|
- hint: placeholder text (only shown when present)
|
|
- editable: true (only shown for text input fields)
|
|
- scrollable: true (only shown for scrollable containers)
|
|
|
|
═══════════════════════════════════════════
|
|
CRITICAL RULES
|
|
═══════════════════════════════════════════
|
|
|
|
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
|
|
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
|
|
3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered.
|
|
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
|
|
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
|
|
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
|
|
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
|
|
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
|
|
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
|
|
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
|
|
11. PASSWORDS: Never log or output the text of password fields.
|
|
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
|
|
13. SEARCH: After typing in a search field, use "enter" to submit the search.
|
|
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
|
|
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
|
|
|
|
// ===========================================
|
|
// Chat Message Types (Phase 4A)
|
|
// ===========================================
|
|
|
|
export type ContentPart =
|
|
| { type: "text"; text: string }
|
|
| { type: "image"; base64: string; mimeType: "image/png" | "image/jpeg" };
|
|
|
|
export interface ChatMessage {
|
|
role: "system" | "user" | "assistant";
|
|
content: string | ContentPart[];
|
|
}
|
|
|
|
// ===========================================
|
|
// Provider Interface
|
|
// ===========================================
|
|
|
|
export interface LLMProvider {
|
|
readonly capabilities: {
|
|
supportsImages: boolean;
|
|
supportsStreaming: boolean;
|
|
};
|
|
getDecision(messages: ChatMessage[]): Promise<ActionDecision>;
|
|
getDecisionStream?(messages: ChatMessage[]): AsyncIterable<string>;
|
|
}
|
|
|
|
// ===========================================
|
|
// Message Trimming (Phase 4A)
|
|
// ===========================================
|
|
|
|
/**
|
|
* Trims conversation messages to keep within history limit.
|
|
* Always keeps the system message. Drops oldest user/assistant pairs.
|
|
*/
|
|
export function trimMessages(
|
|
messages: ChatMessage[],
|
|
maxHistorySteps: number
|
|
): ChatMessage[] {
|
|
if (messages.length === 0) return messages;
|
|
|
|
// System message is always first
|
|
const system = messages[0].role === "system" ? messages[0] : null;
|
|
const rest = system ? messages.slice(1) : messages;
|
|
|
|
// Count user/assistant pairs (each step = 1 user + 1 assistant)
|
|
const maxMessages = maxHistorySteps * 2;
|
|
if (rest.length <= maxMessages) {
|
|
return messages;
|
|
}
|
|
|
|
const dropped = rest.length - maxMessages;
|
|
const stepsDropped = Math.floor(dropped / 2);
|
|
const trimmed = rest.slice(dropped);
|
|
|
|
// Insert a summary note
|
|
const summary: ChatMessage = {
|
|
role: "user",
|
|
content: `[${stepsDropped} earlier steps omitted]`,
|
|
};
|
|
|
|
return system ? [system, summary, ...trimmed] : [summary, ...trimmed];
|
|
}
|
|
|
|
// ===========================================
|
|
// OpenAI / Groq Provider
|
|
// ===========================================
|
|
|
|
class OpenAIProvider implements LLMProvider {
|
|
private client: OpenAI;
|
|
private model: string;
|
|
readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };
|
|
|
|
constructor() {
|
|
if (Config.LLM_PROVIDER === "groq") {
|
|
this.client = new OpenAI({
|
|
apiKey: Config.GROQ_API_KEY,
|
|
baseURL: GROQ_API_BASE_URL,
|
|
});
|
|
this.model = Config.GROQ_MODEL;
|
|
this.capabilities = { supportsImages: false, supportsStreaming: true };
|
|
} else {
|
|
this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
|
|
this.model = Config.OPENAI_MODEL;
|
|
this.capabilities = { supportsImages: true, supportsStreaming: true };
|
|
}
|
|
}
|
|
|
|
private toOpenAIMessages(
|
|
messages: ChatMessage[]
|
|
): OpenAI.ChatCompletionMessageParam[] {
|
|
return messages.map((msg) => {
|
|
if (typeof msg.content === "string") {
|
|
return { role: msg.role, content: msg.content } as OpenAI.ChatCompletionMessageParam;
|
|
}
|
|
// Convert ContentPart[] to OpenAI format
|
|
const parts: OpenAI.ChatCompletionContentPart[] = msg.content.map(
|
|
(part) => {
|
|
if (part.type === "text") {
|
|
return { type: "text" as const, text: part.text };
|
|
}
|
|
// Image — only for OpenAI (Groq skips images)
|
|
if (this.capabilities.supportsImages) {
|
|
return {
|
|
type: "image_url" as const,
|
|
image_url: {
|
|
url: `data:${part.mimeType};base64,${part.base64}`,
|
|
detail: "low" as const,
|
|
},
|
|
};
|
|
}
|
|
// Groq: convert image to text placeholder
|
|
return { type: "text" as const, text: "[Screenshot attached]" };
|
|
}
|
|
);
|
|
return {
|
|
role: msg.role,
|
|
content: parts,
|
|
} as OpenAI.ChatCompletionMessageParam;
|
|
});
|
|
}
|
|
|
|
async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
|
|
const openaiMessages = this.toOpenAIMessages(messages);
|
|
const response = await this.client.chat.completions.create({
|
|
model: this.model,
|
|
response_format: { type: "json_object" },
|
|
messages: openaiMessages,
|
|
});
|
|
return parseJsonResponse(response.choices[0].message.content ?? "{}");
|
|
}
|
|
|
|
async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
|
|
const openaiMessages = this.toOpenAIMessages(messages);
|
|
const stream = await this.client.chat.completions.create({
|
|
model: this.model,
|
|
response_format: { type: "json_object" },
|
|
messages: openaiMessages,
|
|
stream: true,
|
|
});
|
|
for await (const chunk of stream) {
|
|
const content = chunk.choices[0]?.delta?.content;
|
|
if (content) yield content;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ===========================================
|
|
// OpenRouter Provider (Vercel AI SDK)
|
|
// ===========================================
|
|
|
|
class OpenRouterProvider implements LLMProvider {
|
|
private openrouter: ReturnType<typeof createOpenRouter>;
|
|
private model: string;
|
|
readonly capabilities = { supportsImages: true, supportsStreaming: true };
|
|
|
|
constructor() {
|
|
this.openrouter = createOpenRouter({
|
|
apiKey: Config.OPENROUTER_API_KEY,
|
|
});
|
|
this.model = Config.OPENROUTER_MODEL;
|
|
}
|
|
|
|
private toVercelMessages(messages: ChatMessage[]) {
|
|
// Vercel AI SDK uses a similar format but we need to convert images
|
|
const systemMsg = messages.find((m) => m.role === "system");
|
|
const nonSystem = messages.filter((m) => m.role !== "system");
|
|
|
|
const converted = nonSystem.map((msg) => {
|
|
if (typeof msg.content === "string") {
|
|
return { role: msg.role as "user" | "assistant", content: msg.content };
|
|
}
|
|
const parts = msg.content.map((part) => {
|
|
if (part.type === "text") {
|
|
return { type: "text" as const, text: part.text };
|
|
}
|
|
return {
|
|
type: "image" as const,
|
|
image: `data:${part.mimeType};base64,${part.base64}`,
|
|
};
|
|
});
|
|
return { role: msg.role as "user" | "assistant", content: parts };
|
|
});
|
|
|
|
return {
|
|
system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
|
|
messages: converted,
|
|
};
|
|
}
|
|
|
|
async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
|
|
const { system, messages: converted } = this.toVercelMessages(messages);
|
|
const result = await generateText({
|
|
model: this.openrouter.chat(this.model),
|
|
system,
|
|
messages: converted as any,
|
|
});
|
|
return parseJsonResponse(result.text);
|
|
}
|
|
|
|
async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
|
|
const { system, messages: converted } = this.toVercelMessages(messages);
|
|
const result = streamText({
|
|
model: this.openrouter.chat(this.model),
|
|
system,
|
|
messages: converted as any,
|
|
});
|
|
for await (const chunk of result.textStream) {
|
|
yield chunk;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ===========================================
|
|
// AWS Bedrock Provider
|
|
// ===========================================
|
|
|
|
class BedrockProvider implements LLMProvider {
|
|
private client: BedrockRuntimeClient;
|
|
private model: string;
|
|
readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };
|
|
|
|
constructor() {
|
|
this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
|
|
this.model = Config.BEDROCK_MODEL;
|
|
// Only Anthropic models on Bedrock support images
|
|
this.capabilities = {
|
|
supportsImages: this.isAnthropicModel(),
|
|
supportsStreaming: true,
|
|
};
|
|
}
|
|
|
|
private isAnthropicModel(): boolean {
|
|
return BEDROCK_ANTHROPIC_MODELS.some((id) => this.model.includes(id));
|
|
}
|
|
|
|
private isMetaModel(): boolean {
|
|
return BEDROCK_META_MODELS.some((id) =>
|
|
this.model.toLowerCase().includes(id)
|
|
);
|
|
}
|
|
|
|
private buildAnthropicMessages(messages: ChatMessage[]) {
|
|
const systemMsg = messages.find((m) => m.role === "system");
|
|
const nonSystem = messages.filter((m) => m.role !== "system");
|
|
|
|
const converted = nonSystem.map((msg) => {
|
|
if (typeof msg.content === "string") {
|
|
return { role: msg.role, content: msg.content };
|
|
}
|
|
const parts = msg.content.map((part) => {
|
|
if (part.type === "text") {
|
|
return { type: "text", text: part.text };
|
|
}
|
|
return {
|
|
type: "image",
|
|
source: {
|
|
type: "base64",
|
|
media_type: part.mimeType,
|
|
data: part.base64,
|
|
},
|
|
};
|
|
});
|
|
return { role: msg.role, content: parts };
|
|
});
|
|
|
|
return {
|
|
system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
|
|
messages: converted,
|
|
};
|
|
}
|
|
|
|
private buildRequest(messages: ChatMessage[]): string {
|
|
if (this.isAnthropicModel()) {
|
|
const { system, messages: converted } = this.buildAnthropicMessages(messages);
|
|
return JSON.stringify({
|
|
anthropic_version: "bedrock-2023-05-31",
|
|
max_tokens: 1024,
|
|
system,
|
|
messages: converted,
|
|
});
|
|
}
|
|
|
|
// For Meta/other models, flatten to single prompt (no multi-turn / image support)
|
|
const systemContent = messages.find((m) => m.role === "system");
|
|
const userMessages = messages
|
|
.filter((m) => m.role === "user")
|
|
.map((m) =>
|
|
typeof m.content === "string"
|
|
? m.content
|
|
: m.content
|
|
.filter((p) => p.type === "text")
|
|
.map((p) => (p as { type: "text"; text: string }).text)
|
|
.join("\n")
|
|
);
|
|
const lastUserContent = userMessages[userMessages.length - 1] ?? "";
|
|
const sysText =
|
|
typeof systemContent?.content === "string" ? systemContent.content : "";
|
|
|
|
if (this.isMetaModel()) {
|
|
return JSON.stringify({
|
|
prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${sysText}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
|
|
max_gen_len: 512,
|
|
temperature: 0.1,
|
|
});
|
|
}
|
|
|
|
return JSON.stringify({
|
|
inputText: `${sysText}\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object.`,
|
|
textGenerationConfig: {
|
|
maxTokenCount: 512,
|
|
temperature: 0.1,
|
|
},
|
|
});
|
|
}
|
|
|
|
private extractResponse(responseBody: Record<string, any>): string {
|
|
if (this.isAnthropicModel()) {
|
|
return responseBody.content[0].text;
|
|
}
|
|
if (this.isMetaModel()) {
|
|
return responseBody.generation ?? "";
|
|
}
|
|
return responseBody.results[0].outputText;
|
|
}
|
|
|
|
async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
|
|
const requestBody = this.buildRequest(messages);
|
|
const command = new InvokeModelCommand({
|
|
modelId: this.model,
|
|
body: new TextEncoder().encode(requestBody),
|
|
contentType: "application/json",
|
|
accept: "application/json",
|
|
});
|
|
|
|
const response = await this.client.send(command);
|
|
const responseBody = JSON.parse(new TextDecoder().decode(response.body));
|
|
const resultText = this.extractResponse(responseBody);
|
|
return parseJsonResponse(resultText);
|
|
}
|
|
|
|
async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
|
|
if (!this.isAnthropicModel()) {
|
|
// Fallback: non-streaming for non-Anthropic models
|
|
const decision = await this.getDecision(messages);
|
|
yield JSON.stringify(decision);
|
|
return;
|
|
}
|
|
|
|
const { system, messages: converted } = this.buildAnthropicMessages(messages);
|
|
const requestBody = JSON.stringify({
|
|
anthropic_version: "bedrock-2023-05-31",
|
|
max_tokens: 1024,
|
|
system,
|
|
messages: converted,
|
|
});
|
|
|
|
const command = new InvokeModelWithResponseStreamCommand({
|
|
modelId: this.model,
|
|
body: new TextEncoder().encode(requestBody),
|
|
contentType: "application/json",
|
|
});
|
|
|
|
const response = await this.client.send(command);
|
|
if (response.body) {
|
|
for await (const event of response.body) {
|
|
if (event.chunk?.bytes) {
|
|
const data = JSON.parse(new TextDecoder().decode(event.chunk.bytes));
|
|
if (data.type === "content_block_delta" && data.delta?.text) {
|
|
yield data.delta.text;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ===========================================
|
|
// Shared JSON Parsing
|
|
// ===========================================
|
|
|
|
function parseJsonResponse(text: string): ActionDecision {
|
|
try {
|
|
return JSON.parse(text);
|
|
} catch {
|
|
// Try to extract JSON from markdown code blocks or mixed text
|
|
const match = text.match(/\{[\s\S]*?\}/);
|
|
if (match) {
|
|
try {
|
|
return JSON.parse(match[0]);
|
|
} catch {
|
|
// fall through
|
|
}
|
|
}
|
|
console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
|
|
return { action: "wait", reason: "Failed to parse response, waiting" };
|
|
}
|
|
}
|
|
|
|
// ===========================================
|
|
// Factory
|
|
// ===========================================
|
|
|
|
export function getLlmProvider(): LLMProvider {
|
|
if (Config.LLM_PROVIDER === "bedrock") {
|
|
return new BedrockProvider();
|
|
}
|
|
if (Config.LLM_PROVIDER === "openrouter") {
|
|
return new OpenRouterProvider();
|
|
}
|
|
return new OpenAIProvider();
|
|
}
|