Flatten project structure: move android-action-kernel/ to root
Removes the unnecessary nesting — all source, config, and docs now live at the project root for simpler paths and commands. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
535
src/llm-providers.ts
Normal file
535
src/llm-providers.ts
Normal file
@@ -0,0 +1,535 @@
|
||||
/**
|
||||
* LLM Provider module for Android Action Kernel.
|
||||
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
|
||||
*
|
||||
* Phase 3: Real multimodal vision (image content parts)
|
||||
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
|
||||
* Phase 5: Streaming responses (getDecisionStream)
|
||||
*/
|
||||
|
||||
import OpenAI from "openai";
|
||||
import {
|
||||
BedrockRuntimeClient,
|
||||
InvokeModelCommand,
|
||||
InvokeModelWithResponseStreamCommand,
|
||||
} from "@aws-sdk/client-bedrock-runtime";
|
||||
import { generateText, streamText } from "ai";
|
||||
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
||||
|
||||
import { Config } from "./config.js";
|
||||
import {
|
||||
GROQ_API_BASE_URL,
|
||||
BEDROCK_ANTHROPIC_MODELS,
|
||||
BEDROCK_META_MODELS,
|
||||
} from "./constants.js";
|
||||
import type { ActionDecision } from "./actions.js";
|
||||
|
||||
// ===========================================
|
||||
// System Prompt — all 15 actions + planning
|
||||
// ===========================================
|
||||
|
||||
/**
 * Instruction text for the "Android Driver Agent" persona.
 *
 * The string is organised into sections, in this order:
 *  - the inputs the model is told it will receive (GOAL, FOREGROUND_APP,
 *    SCREEN_CONTEXT, SCREENSHOT, SCREEN_CHANGE, VISION_FALLBACK);
 *  - the output contract: a single JSON object, with a "think" field and
 *    optional "plan" / "planProgress" fields;
 *  - the catalogue of 15 actions, grouped as Navigation, Text Input,
 *    App Control, Data and System;
 *  - the element properties listed for SCREEN_CONTEXT entries;
 *  - 15 numbered "critical rules".
 *
 * The action count in the "AVAILABLE ACTIONS (15 total)" heading and the rule
 * numbering are part of the text itself, so they must be kept in sync by hand
 * when actions or rules are added or removed.
 */
export const SYSTEM_PROMPT = `You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the Android UI.
|
||||
|
||||
You will receive:
|
||||
1. GOAL — the user's task.
|
||||
2. FOREGROUND_APP — the currently active app package and activity.
|
||||
3. SCREEN_CONTEXT — JSON array of interactive UI elements with coordinates and states.
|
||||
4. SCREENSHOT — an image of the current screen (when available).
|
||||
5. SCREEN_CHANGE — what changed since your last action (or if the screen is stuck).
|
||||
6. VISION_FALLBACK — present when the accessibility tree is empty (custom UI / WebView).
|
||||
|
||||
Previous conversation turns contain your earlier observations and actions (multi-turn memory).
|
||||
|
||||
You must output ONLY a valid JSON object with your next action.
|
||||
|
||||
═══════════════════════════════════════════
|
||||
THINKING & PLANNING
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Before each action, include a "think" field with your reasoning about the current state and what to do next.
|
||||
|
||||
Optionally include:
|
||||
- "plan": an array of 3-5 high-level steps to achieve the goal
|
||||
- "planProgress": a brief note on which plan step you're currently on
|
||||
|
||||
Example:
|
||||
{"think": "I see the Settings app is open. I need to scroll down to find Display settings.", "plan": ["Open Settings", "Navigate to Display", "Change theme to dark", "Verify change"], "planProgress": "Step 2: navigating to Display", "action": "swipe", "direction": "up", "reason": "Scroll down to find Display option"}
|
||||
|
||||
═══════════════════════════════════════════
|
||||
AVAILABLE ACTIONS (15 total)
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Navigation:
|
||||
{"action": "tap", "coordinates": [x, y], "reason": "..."}
|
||||
{"action": "longpress", "coordinates": [x, y], "reason": "..."}
|
||||
{"action": "swipe", "direction": "up|down|left|right", "reason": "..."}
|
||||
{"action": "enter", "reason": "Press Enter/submit"}
|
||||
{"action": "back", "reason": "Navigate back"}
|
||||
{"action": "home", "reason": "Go to home screen"}
|
||||
|
||||
Text Input:
|
||||
{"action": "type", "text": "Hello World", "reason": "..."}
|
||||
{"action": "clear", "reason": "Clear current text field before typing"}
|
||||
|
||||
App Control:
|
||||
{"action": "launch", "package": "com.whatsapp", "reason": "Open WhatsApp"}
|
||||
{"action": "launch", "uri": "https://maps.google.com/?q=pizza", "reason": "Open URL"}
|
||||
{"action": "launch", "package": "com.whatsapp", "uri": "content://media/external/images/1", "extras": {"android.intent.extra.TEXT": "Check this"}, "reason": "Share image to WhatsApp"}
|
||||
|
||||
Data:
|
||||
{"action": "screenshot", "reason": "Capture current screen"}
|
||||
{"action": "screenshot", "filename": "order_confirmation.png", "reason": "Save proof"}
|
||||
{"action": "clipboard_get", "reason": "Read clipboard contents"}
|
||||
{"action": "clipboard_set", "text": "copied text", "reason": "Set clipboard"}
|
||||
|
||||
System:
|
||||
{"action": "shell", "command": "am force-stop com.app.broken", "reason": "Kill crashed app"}
|
||||
{"action": "wait", "reason": "Wait for screen to load"}
|
||||
{"action": "done", "reason": "Task is complete"}
|
||||
|
||||
═══════════════════════════════════════════
|
||||
ELEMENT PROPERTIES YOU WILL SEE
|
||||
═══════════════════════════════════════════
|
||||
|
||||
Each element in SCREEN_CONTEXT has:
|
||||
- text: visible label or content description
|
||||
- center: [x, y] coordinates to tap
|
||||
- action: suggested action — "tap", "type", "longpress", "scroll", or "read"
|
||||
- enabled: false (only shown when disabled — DO NOT tap disabled elements!)
|
||||
- checked: true (only shown for ON checkboxes/toggles)
|
||||
- focused: true (only shown when field has input focus)
|
||||
- hint: placeholder text (only shown when present)
|
||||
- editable: true (only shown for text input fields)
|
||||
- scrollable: true (only shown for scrollable containers)
|
||||
|
||||
═══════════════════════════════════════════
|
||||
CRITICAL RULES
|
||||
═══════════════════════════════════════════
|
||||
|
||||
1. DISABLED ELEMENTS: If "enabled": false, DO NOT tap or interact with it. Find an alternative.
|
||||
2. TEXT INPUT: If "editable": true, use "clear" first if field has existing text, then "type".
|
||||
3. ALREADY TYPED: Check your previous actions. Do NOT re-type text you already entered.
|
||||
4. REPETITION: Do NOT tap the same coordinates twice in a row. If it didn't work, try something else.
|
||||
5. STUCK: If SCREEN_CHANGE says "NOT changed", your last action had no effect. Change strategy.
|
||||
6. APP LAUNCH: Use "launch" to directly open apps instead of hunting for icons on the home screen.
|
||||
7. SCREENSHOTS: Use "screenshot" to capture proof of completed tasks (order confirmations, etc).
|
||||
8. LONG PRESS: Use "longpress" when you see "longClickable": true (context menus, copy/paste, etc).
|
||||
9. SCROLLING: If the item you need isn't visible, "swipe" up/down to scroll and find it.
|
||||
10. MULTI-APP: To switch apps, use "home" then "launch" the next app. Or use "back" to return.
|
||||
11. PASSWORDS: Never log or output the text of password fields.
|
||||
12. DONE: Say "done" as soon as the goal is achieved. Don't keep acting after success.
|
||||
13. SEARCH: After typing in a search field, use "enter" to submit the search.
|
||||
14. SHARE: To send files/images between apps, use "launch" with uri + extras for Android intents.
|
||||
15. CLEANUP: If a popup/ad appears, dismiss it with "back" or tap the close button, then continue.`;
|
||||
|
||||
// ===========================================
|
||||
// Chat Message Types (Phase 4A)
|
||||
// ===========================================
|
||||
|
||||
/** A plain-text fragment of a message body. */
interface TextContentPart {
  type: "text";
  text: string;
}

/** An inline image fragment: base64-encoded image data plus its MIME type. */
interface ImageContentPart {
  type: "image";
  base64: string;
  mimeType: "image/png" | "image/jpeg";
}

/** One piece of multimodal message content, discriminated by its `type` tag. */
export type ContentPart = TextContentPart | ImageContentPart;
|
||||
|
||||
/** Who authored a conversation turn. */
type ChatRole = "system" | "user" | "assistant";

/**
 * A single conversation turn.
 *
 * `content` is either plain text or an ordered list of text/image parts.
 */
export interface ChatMessage {
  role: ChatRole;
  content: string | ContentPart[];
}
|
||||
|
||||
// ===========================================
|
||||
// Provider Interface
|
||||
// ===========================================
|
||||
|
||||
/** Feature flags a provider advertises about itself. */
interface ProviderCapabilities {
  supportsImages: boolean;
  supportsStreaming: boolean;
}

/**
 * Contract shared by every LLM backend in this module.
 *
 * `getDecision` resolves to a parsed action decision for the given
 * conversation. `getDecisionStream` is optional and, when present, yields the
 * raw response text in chunks.
 */
export interface LLMProvider {
  readonly capabilities: ProviderCapabilities;
  getDecision(messages: ChatMessage[]): Promise<ActionDecision>;
  getDecisionStream?(messages: ChatMessage[]): AsyncIterable<string>;
}
|
||||
|
||||
// ===========================================
|
||||
// Message Trimming (Phase 4A)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Trims a conversation so that at most `maxHistorySteps` steps of history remain.
 *
 * One step is budgeted as two messages (a user turn plus an assistant turn).
 * A leading system message is always kept and does not count toward the
 * budget. When anything is dropped, a user-role note stating how many steps
 * were omitted is inserted in place of the dropped messages.
 *
 * @param messages - Conversation in chronological order; a system message, if
 *   present, is expected at index 0.
 * @param maxHistorySteps - Maximum number of steps to keep. Negative values are
 *   treated as 0; `NaN` (for example an unparsed setting) disables trimming.
 * @returns The same array instance when nothing has to be dropped, otherwise a
 *   new array `[system?, summary, ...keptMessages]`.
 */
export function trimMessages(
  messages: ChatMessage[],
  maxHistorySteps: number
): ChatMessage[] {
  if (messages.length === 0) return messages;

  // A NaN limit cannot be compared against; previously it trimmed nothing but
  // still injected a "[NaN earlier steps omitted]" note into the conversation.
  if (Number.isNaN(maxHistorySteps)) return messages;

  // System message is always first
  const system = messages[0]?.role === "system" ? messages[0] : null;
  const history = system ? messages.slice(1) : messages;

  // Each step = 1 user + 1 assistant message. Clamp so a negative limit means
  // "keep no history" instead of producing an inflated dropped count.
  const maxMessages = Math.floor(Math.max(0, maxHistorySteps) * 2);
  if (history.length <= maxMessages) {
    return messages;
  }

  const droppedMessages = history.length - maxMessages;
  // Round up: a step that lost only one of its two messages is still a step
  // the model can no longer see in full (flooring reported "0 steps omitted").
  const stepsDropped = Math.ceil(droppedMessages / 2);
  const kept = history.slice(droppedMessages);

  // Insert a summary note
  const summary: ChatMessage = {
    role: "user",
    content: `[${stepsDropped} earlier steps omitted]`,
  };

  return system ? [system, summary, ...kept] : [summary, ...kept];
}
|
||||
|
||||
// ===========================================
|
||||
// OpenAI / Groq Provider
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider backed by the OpenAI chat-completions client.
 *
 * The same client also serves Groq: when `Config.LLM_PROVIDER` is "groq" the
 * client is pointed at `GROQ_API_BASE_URL` with the Groq key/model and the
 * provider reports that it does not support images. Otherwise the OpenAI
 * key/model are used and images are supported.
 */
class OpenAIProvider implements LLMProvider {
  private readonly client: OpenAI;
  private readonly model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };

  constructor() {
    if (Config.LLM_PROVIDER === "groq") {
      this.client = new OpenAI({
        apiKey: Config.GROQ_API_KEY,
        baseURL: GROQ_API_BASE_URL,
      });
      this.model = Config.GROQ_MODEL;
      this.capabilities = { supportsImages: false, supportsStreaming: true };
    } else {
      this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
      this.model = Config.OPENAI_MODEL;
      this.capabilities = { supportsImages: true, supportsStreaming: true };
    }
  }

  /**
   * Converts provider-neutral messages into OpenAI chat message params.
   *
   * String content is passed through. Part arrays are mapped one-to-one:
   * text parts stay text; image parts become low-detail `image_url` data URLs
   * when images are supported, and a "[Screenshot attached]" text placeholder
   * otherwise.
   */
  private toOpenAIMessages(
    messages: ChatMessage[]
  ): OpenAI.ChatCompletionMessageParam[] {
    return messages.map((msg) => {
      if (typeof msg.content === "string") {
        return { role: msg.role, content: msg.content } as OpenAI.ChatCompletionMessageParam;
      }
      // Convert ContentPart[] to OpenAI format
      const parts: OpenAI.ChatCompletionContentPart[] = msg.content.map(
        (part) => {
          if (part.type === "text") {
            return { type: "text" as const, text: part.text };
          }
          // Image — only for OpenAI (Groq skips images)
          if (this.capabilities.supportsImages) {
            return {
              type: "image_url" as const,
              image_url: {
                url: `data:${part.mimeType};base64,${part.base64}`,
                detail: "low" as const,
              },
            };
          }
          // Groq: convert image to text placeholder
          return { type: "text" as const, text: "[Screenshot attached]" };
        }
      );
      return {
        role: msg.role,
        content: parts,
      } as OpenAI.ChatCompletionMessageParam;
    });
  }

  /**
   * Requests a single JSON-object completion and parses it into a decision.
   *
   * @param messages - Conversation to send.
   * @returns The parsed decision; when the response carries no text, whatever
   *   `parseJsonResponse` returns for an empty string.
   */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const openaiMessages = this.toOpenAIMessages(messages);
    const response = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: openaiMessages,
    });
    // Guard the first choice the same way the streaming path does, and hand an
    // empty string to the parser when there is no text. Substituting "{}"
    // produced an action-less object typed as ActionDecision.
    const text = response.choices[0]?.message?.content ?? "";
    return parseJsonResponse(text);
  }

  /**
   * Streams the completion text as it arrives.
   *
   * Yields only non-empty content deltas of the first choice; the caller is
   * responsible for assembling and parsing the chunks.
   */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const openaiMessages = this.toOpenAIMessages(messages);
    const stream = await this.client.chat.completions.create({
      model: this.model,
      response_format: { type: "json_object" },
      messages: openaiMessages,
      stream: true,
    });
    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content;
      if (content) yield content;
    }
  }
}
|
||||
|
||||
// ===========================================
|
||||
// OpenRouter Provider (Vercel AI SDK)
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider that talks to OpenRouter through the Vercel AI SDK
 * (`generateText` / `streamText`). Reports image and streaming support.
 */
class OpenRouterProvider implements LLMProvider {
  private openrouter: ReturnType<typeof createOpenRouter>;
  private model: string;
  readonly capabilities = { supportsImages: true, supportsStreaming: true };

  constructor() {
    this.openrouter = createOpenRouter({ apiKey: Config.OPENROUTER_API_KEY });
    this.model = Config.OPENROUTER_MODEL;
  }

  /**
   * Splits a conversation into the SDK's separate `system` string and message
   * list.
   *
   * The first system message supplies `system` (empty string when absent or
   * not plain text); all system messages are removed from the list. Image
   * parts are rewritten as data URLs.
   */
  private toVercelMessages(messages: ChatMessage[]) {
    const firstSystem = messages.find(({ role }) => role === "system");
    const system =
      typeof firstSystem?.content === "string" ? firstSystem.content : "";

    const turns = messages
      .filter(({ role }) => role !== "system")
      .map(({ role, content }) => {
        const speaker = role as "user" | "assistant";
        if (typeof content === "string") {
          return { role: speaker, content };
        }
        const parts = content.map((part) =>
          part.type === "text"
            ? { type: "text" as const, text: part.text }
            : {
                type: "image" as const,
                image: `data:${part.mimeType};base64,${part.base64}`,
              }
        );
        return { role: speaker, content: parts };
      });

    return { system, messages: turns };
  }

  /** Generates one full response and parses it into a decision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const request = this.toVercelMessages(messages);
    const { text } = await generateText({
      model: this.openrouter.chat(this.model),
      system: request.system,
      messages: request.messages as any,
    });
    return parseJsonResponse(text);
  }

  /** Streams the response text chunk by chunk, unparsed. */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    const request = this.toVercelMessages(messages);
    const { textStream } = streamText({
      model: this.openrouter.chat(this.model),
      system: request.system,
      messages: request.messages as any,
    });
    yield* textStream;
  }
}
|
||||
|
||||
// ===========================================
|
||||
// AWS Bedrock Provider
|
||||
// ===========================================
|
||||
|
||||
/**
 * Provider that invokes models through the AWS Bedrock runtime client.
 *
 * Three request/response shapes are handled, chosen from the configured model
 * id: Anthropic (messages API with images and multi-turn history), Meta
 * (single flattened prompt), and a generic `inputText` shape for everything
 * else. Only the Anthropic path is reported as supporting images.
 */
class BedrockProvider implements LLMProvider {
  private readonly client: BedrockRuntimeClient;
  private readonly model: string;
  readonly capabilities: { supportsImages: boolean; supportsStreaming: boolean };

  constructor() {
    this.client = new BedrockRuntimeClient({ region: Config.AWS_REGION });
    this.model = Config.BEDROCK_MODEL;
    // Only Anthropic models on Bedrock support images
    this.capabilities = {
      supportsImages: this.isAnthropicModel(),
      supportsStreaming: true,
    };
  }

  /** True when the configured model id contains any of `ids`, ignoring case. */
  private modelMatches(ids: readonly string[]): boolean {
    const model = this.model.toLowerCase();
    return ids.some((id) => model.includes(id.toLowerCase()));
  }

  // Both family checks share one case-insensitive matcher; previously only the
  // Meta check lower-cased the model id.
  private isAnthropicModel(): boolean {
    return this.modelMatches(BEDROCK_ANTHROPIC_MODELS);
  }

  private isMetaModel(): boolean {
    return this.modelMatches(BEDROCK_META_MODELS);
  }

  /**
   * Converts a conversation to the Anthropic messages shape.
   *
   * The first system message becomes the separate `system` string (empty when
   * absent or not plain text); system messages are removed from the list.
   * Image parts become base64 `source` blocks.
   */
  private buildAnthropicMessages(messages: ChatMessage[]) {
    const systemMsg = messages.find((m) => m.role === "system");
    const nonSystem = messages.filter((m) => m.role !== "system");

    const converted = nonSystem.map((msg) => {
      if (typeof msg.content === "string") {
        return { role: msg.role, content: msg.content };
      }
      const parts = msg.content.map((part) => {
        if (part.type === "text") {
          return { type: "text", text: part.text };
        }
        return {
          type: "image",
          source: {
            type: "base64",
            media_type: part.mimeType,
            data: part.base64,
          },
        };
      });
      return { role: msg.role, content: parts };
    });

    return {
      system: typeof systemMsg?.content === "string" ? systemMsg.content : "",
      messages: converted,
    };
  }

  /**
   * Serialises the request body for the configured model family.
   *
   * @returns A JSON string ready to be encoded as the invoke body.
   */
  private buildRequest(messages: ChatMessage[]): string {
    if (this.isAnthropicModel()) {
      const { system, messages: converted } = this.buildAnthropicMessages(messages);
      return JSON.stringify({
        anthropic_version: "bedrock-2023-05-31",
        max_tokens: 1024,
        system,
        messages: converted,
      });
    }

    // For Meta/other models, flatten to single prompt (no multi-turn / image support)
    const systemContent = messages.find((m) => m.role === "system");
    const userMessages = messages
      .filter((m) => m.role === "user")
      .map((m) =>
        typeof m.content === "string"
          ? m.content
          : m.content
              .filter((p) => p.type === "text")
              .map((p) => (p as { type: "text"; text: string }).text)
              .join("\n")
      );
    // Only the most recent user message is sent on this path.
    const lastUserContent = userMessages[userMessages.length - 1] ?? "";
    const sysText =
      typeof systemContent?.content === "string" ? systemContent.content : "";

    if (this.isMetaModel()) {
      return JSON.stringify({
        prompt: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n${sysText}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object, no other text.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
        max_gen_len: 512,
        temperature: 0.1,
      });
    }

    return JSON.stringify({
      inputText: `${sysText}\n\n${lastUserContent}\n\nRespond with ONLY a valid JSON object.`,
      textGenerationConfig: {
        maxTokenCount: 512,
        temperature: 0.1,
      },
    });
  }

  /**
   * Pulls the generated text out of a decoded response body.
   *
   * Every branch falls back to an empty string when the expected field is
   * missing, so a malformed body reaches the JSON parser's fallback instead of
   * throwing a TypeError here (previously only the Meta branch did this).
   */
  private extractResponse(responseBody: Record<string, any>): string {
    if (this.isAnthropicModel()) {
      return responseBody.content?.[0]?.text ?? "";
    }
    if (this.isMetaModel()) {
      return responseBody.generation ?? "";
    }
    return responseBody.results?.[0]?.outputText ?? "";
  }

  /** Invokes the model once and parses the returned text into a decision. */
  async getDecision(messages: ChatMessage[]): Promise<ActionDecision> {
    const requestBody = this.buildRequest(messages);
    const command = new InvokeModelCommand({
      modelId: this.model,
      body: new TextEncoder().encode(requestBody),
      contentType: "application/json",
      accept: "application/json",
    });

    const response = await this.client.send(command);
    const responseBody = JSON.parse(new TextDecoder().decode(response.body));
    const resultText = this.extractResponse(responseBody);
    return parseJsonResponse(resultText);
  }

  /**
   * Streams response text for Anthropic models.
   *
   * Other model families are not streamed: the full decision is fetched with
   * `getDecision` and yielded once as a JSON string.
   */
  async *getDecisionStream(messages: ChatMessage[]): AsyncIterable<string> {
    if (!this.isAnthropicModel()) {
      // Fallback: non-streaming for non-Anthropic models
      const decision = await this.getDecision(messages);
      yield JSON.stringify(decision);
      return;
    }

    // buildRequest already emits the Anthropic body on this path; reuse it so
    // the streaming and non-streaming requests cannot drift apart.
    const command = new InvokeModelWithResponseStreamCommand({
      modelId: this.model,
      body: new TextEncoder().encode(this.buildRequest(messages)),
      contentType: "application/json",
    });

    const response = await this.client.send(command);
    if (!response.body) return;

    const decoder = new TextDecoder();
    for await (const event of response.body) {
      if (!event.chunk?.bytes) continue;
      const data = JSON.parse(decoder.decode(event.chunk.bytes));
      // Only text deltas carry response text; other event types are skipped.
      if (data.type === "content_block_delta" && data.delta?.text) {
        yield data.delta.text;
      }
    }
  }
}
|
||||
|
||||
// ===========================================
|
||||
// Shared JSON Parsing
|
||||
// ===========================================
|
||||
|
||||
/**
 * Parses `text` as JSON and returns it only when it is a plain object
 * (not `null`, not an array, not a primitive); otherwise returns `null`.
 */
function tryParseJsonObject(text: string): ActionDecision | null {
  try {
    const value: unknown = JSON.parse(text);
    if (typeof value === "object" && value !== null && !Array.isArray(value)) {
      return value as ActionDecision;
    }
  } catch {
    // not valid JSON — caller decides what to try next
  }
  return null;
}

/**
 * Given the index of a `{` in `text`, returns the index of the `}` that
 * closes it, or -1 when the braces never balance. Braces inside double-quoted
 * strings (including escaped quotes) are ignored.
 */
function findBalancedObjectEnd(text: string, start: number): number {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Turns raw LLM output into an action decision.
 *
 * The whole text is tried as JSON first. If that fails (markdown fences,
 * surrounding prose), each `{` is tried as the start of a brace-balanced
 * candidate, so nested objects such as `"extras": {...}` and `}` characters
 * inside strings no longer cut the match short the way the previous
 * non-greedy regex did. Only plain JSON objects are accepted.
 *
 * @param text - Raw response text from a provider.
 * @returns The first parseable JSON object, or a `wait` decision (after
 *   logging the first 200 characters) when none is found.
 */
function parseJsonResponse(text: string): ActionDecision {
  const whole = tryParseJsonObject(text);
  if (whole) return whole;

  // Try to extract JSON from markdown code blocks or mixed text
  for (
    let start = text.indexOf("{");
    start !== -1;
    start = text.indexOf("{", start + 1)
  ) {
    const end = findBalancedObjectEnd(text, start);
    if (end === -1) continue;
    const candidate = tryParseJsonObject(text.slice(start, end + 1));
    if (candidate) return candidate;
  }

  console.log(`Warning: Could not parse LLM response: ${text.slice(0, 200)}`);
  return { action: "wait", reason: "Failed to parse response, waiting" };
}
|
||||
|
||||
// ===========================================
|
||||
// Factory
|
||||
// ===========================================
|
||||
|
||||
/**
 * Creates the provider selected by `Config.LLM_PROVIDER`.
 *
 * "bedrock" and "openrouter" get their dedicated providers; every other value
 * (including "groq", which the OpenAI provider handles itself) gets the
 * OpenAI-compatible provider.
 */
export function getLlmProvider(): LLMProvider {
  switch (Config.LLM_PROVIDER) {
    case "bedrock":
      return new BedrockProvider();
    case "openrouter":
      return new OpenRouterProvider();
    default:
      return new OpenAIProvider();
  }
}
|
||||
Reference in New Issue
Block a user