Add Ollama provider for local LLM support
Reuses OpenAIProvider via Ollama's OpenAI-compatible API at localhost:11434. No API key needed - just install Ollama, pull a model, and set LLM_PROVIDER=ollama. Vision models (llava, llama3.2-vision) supported for screenshot fallback. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
17
.env.example
17
.env.example
@@ -39,7 +39,7 @@ MAX_HISTORY_STEPS=10 # How many past steps to keep in conversation context
|
|||||||
STREAMING_ENABLED=true # Stream LLM responses (shows progress dots)
|
STREAMING_ENABLED=true # Stream LLM responses (shows progress dots)
|
||||||
|
|
||||||
# ===========================================
|
# ===========================================
|
||||||
# LLM Provider: "groq", "openai", "bedrock", or "openrouter"
|
# LLM Provider: "groq", "openai", "bedrock", "openrouter", or "ollama"
|
||||||
# ===========================================
|
# ===========================================
|
||||||
LLM_PROVIDER=groq
|
LLM_PROVIDER=groq
|
||||||
|
|
||||||
@@ -84,3 +84,18 @@ OPENROUTER_MODEL=anthropic/claude-3.5-sonnet
|
|||||||
# meta-llama/llama-3.3-70b-instruct (open source)
|
# meta-llama/llama-3.3-70b-instruct (open source)
|
||||||
# mistralai/mistral-large-latest (European)
|
# mistralai/mistral-large-latest (European)
|
||||||
# deepseek/deepseek-chat (cost efficient)
|
# deepseek/deepseek-chat (cost efficient)
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# Ollama Configuration (local LLMs, no API key needed)
|
||||||
|
# Install: https://ollama.com then: ollama pull llama3.2
|
||||||
|
# ===========================================
|
||||||
|
OLLAMA_BASE_URL=http://localhost:11434/v1
|
||||||
|
OLLAMA_MODEL=llama3.2
|
||||||
|
# Vision models (for screenshot support):
|
||||||
|
# llava (7B, good vision)
|
||||||
|
# llama3.2-vision (11B, best open-source vision)
|
||||||
|
# Text-only models:
|
||||||
|
# llama3.2 (3B, fast)
|
||||||
|
# llama3.1 (8B, balanced)
|
||||||
|
# qwen2.5 (7B, strong reasoning)
|
||||||
|
# mistral (7B, fast)
|
||||||
|
|||||||
@@ -27,14 +27,14 @@ Seven source files in `src/`, no subdirectories:
|
|||||||
|
|
||||||
- **kernel.ts** — Entry point and main agent loop. Reads goal from stdin, runs up to MAX_STEPS iterations of: capture screen → diff with previous → call LLM → execute action → track history. Handles stuck-loop detection and vision fallback when the accessibility tree is empty.
|
- **kernel.ts** — Entry point and main agent loop. Reads goal from stdin, runs up to MAX_STEPS iterations of: capture screen → diff with previous → call LLM → execute action → track history. Handles stuck-loop detection and vision fallback when the accessibility tree is empty.
|
||||||
- **actions.ts** — 15 action implementations (tap, type, enter, swipe, home, back, wait, done, longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell). Each wraps ADB commands via `Bun.spawnSync()`. `runAdbCommand()` provides exponential backoff retry.
|
- **actions.ts** — 15 action implementations (tap, type, enter, swipe, home, back, wait, done, longpress, screenshot, launch, clear, clipboard_get, clipboard_set, shell). Each wraps ADB commands via `Bun.spawnSync()`. `runAdbCommand()` provides exponential backoff retry.
|
||||||
- **llm-providers.ts** — LLM abstraction with `LLMProvider` interface and factory (`getLlmProvider()`). Four providers: OpenAI, Groq (OpenAI-compatible endpoint), AWS Bedrock (Anthropic + Meta model formats), OpenRouter (Vercel AI SDK). Contains the full SYSTEM_PROMPT with all 15 action definitions and rules.
|
- **llm-providers.ts** — LLM abstraction with `LLMProvider` interface and factory (`getLlmProvider()`). Five providers: OpenAI, Groq (OpenAI-compatible endpoint), Ollama (local LLMs, OpenAI-compatible), AWS Bedrock (Anthropic + Meta model formats), OpenRouter (Vercel AI SDK). Contains the full SYSTEM_PROMPT with all 15 action definitions and rules.
|
||||||
- **sanitizer.ts** — Parses Android Accessibility XML (via `fast-xml-parser`) into `UIElement[]`. Depth-first walk extracting bounds, center coordinates, state flags (enabled, checked, focused, etc.), and parent context. `computeScreenHash()` used for stuck-loop detection.
|
- **sanitizer.ts** — Parses Android Accessibility XML (via `fast-xml-parser`) into `UIElement[]`. Depth-first walk extracting bounds, center coordinates, state flags (enabled, checked, focused, etc.), and parent context. `computeScreenHash()` used for stuck-loop detection.
|
||||||
- **config.ts** — Singleton `Config` object reading from `process.env` with defaults from constants. `Config.validate()` checks required API keys at startup.
|
- **config.ts** — Singleton `Config` object reading from `process.env` with defaults from constants. `Config.validate()` checks required API keys at startup.
|
||||||
- **constants.ts** — All magic values: ADB keycodes, swipe coordinates (hardcoded for 1080px-wide screens), default models, file paths, agent defaults.
|
- **constants.ts** — All magic values: ADB keycodes, swipe coordinates (hardcoded for 1080px-wide screens), default models, file paths, agent defaults.
|
||||||
|
|
||||||
## Key Patterns
|
## Key Patterns
|
||||||
|
|
||||||
- **Provider factory:** `getLlmProvider()` returns the appropriate `LLMProvider` based on `Config.LLM_PROVIDER`. Groq reuses the `OpenAIProvider` class with a different base URL.
|
- **Provider factory:** `getLlmProvider()` returns the appropriate `LLMProvider` based on `Config.LLM_PROVIDER`. Groq and Ollama reuse the `OpenAIProvider` class with different base URLs.
|
||||||
- **Screen state diffing:** Hash-based comparison (id + text + center + state). After STUCK_THRESHOLD unchanged steps, recovery hints are injected into the LLM prompt.
|
- **Screen state diffing:** Hash-based comparison (id + text + center + state). After STUCK_THRESHOLD unchanged steps, recovery hints are injected into the LLM prompt.
|
||||||
- **Vision fallback:** When `getInteractiveElements()` returns empty (custom UI, WebView, Flutter), a screenshot is captured and the LLM gets a fallback context suggesting coordinate-based taps.
|
- **Vision fallback:** When `getInteractiveElements()` returns empty (custom UI, WebView, Flutter), a screenshot is captured and the LLM gets a fallback context suggesting coordinate-based taps.
|
||||||
- **LLM response parsing:** `parseJsonResponse()` handles both clean JSON and markdown-wrapped code blocks. Falls back to "wait" action on parse failure.
|
- **LLM response parsing:** `parseJsonResponse()` handles both clean JSON and markdown-wrapped code blocks. Falls back to "wait" action on parse failure.
|
||||||
@@ -56,7 +56,7 @@ Seven source files in `src/`, no subdirectories:
|
|||||||
|
|
||||||
## Environment Setup
|
## Environment Setup
|
||||||
|
|
||||||
Requires: Bun 1.0+, ADB (Android SDK Platform Tools) in PATH, an Android device connected via USB/WiFi with accessibility enabled, and an API key for at least one LLM provider (Groq, OpenAI, Bedrock, or OpenRouter).
|
Requires: Bun 1.0+, ADB (Android SDK Platform Tools) in PATH, an Android device connected via USB/WiFi with accessibility enabled, and either a local Ollama install or an API key for a cloud LLM provider (Groq, OpenAI, Bedrock, or OpenRouter).
|
||||||
|
|
||||||
Copy `.env.example` to `.env` and configure `LLM_PROVIDER` + the corresponding API key.
|
Copy `.env.example` to `.env` and configure `LLM_PROVIDER` plus the corresponding API key (Ollama runs locally and needs no API key).
|
||||||
|
|
||||||
|
|||||||
17
README.md
17
README.md
@@ -32,7 +32,7 @@ action: done (412ms)
|
|||||||
|
|
||||||
## setup
|
## setup
|
||||||
|
|
||||||
you need **bun**, **adb**, and an api key for any llm provider.
|
you need **bun**, **adb**, and either [ollama](https://ollama.com) for local models or an api key for a cloud provider.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# install adb if you don't have it
|
# install adb if you don't have it
|
||||||
@@ -42,9 +42,15 @@ bun install
|
|||||||
cp .env.example .env
|
cp .env.example .env
|
||||||
```
|
```
|
||||||
|
|
||||||
edit `.env` - fastest way to start is with groq (free tier):
|
edit `.env` - fastest way to start is with ollama (fully local, no api key):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# option a: local with ollama (no api key needed)
|
||||||
|
# run once in your shell first (not a .env entry): ollama pull llama3.2
|
||||||
|
LLM_PROVIDER=ollama
|
||||||
|
OLLAMA_MODEL=llama3.2
|
||||||
|
|
||||||
|
# option b: cloud with groq (free tier)
|
||||||
LLM_PROVIDER=groq
|
LLM_PROVIDER=groq
|
||||||
GROQ_API_KEY=gsk_your_key_here
|
GROQ_API_KEY=gsk_your_key_here
|
||||||
```
|
```
|
||||||
@@ -189,11 +195,14 @@ name: Send WhatsApp Message
|
|||||||
|
|
||||||
| provider | cost | vision | notes |
|
| provider | cost | vision | notes |
|
||||||
|---|---|---|---|
|
|---|---|---|---|
|
||||||
| groq | free tier | no | fastest to start |
|
| ollama | free (local) | yes* | no api key, runs on your machine |
|
||||||
|
| groq | free tier | no | fastest cloud option |
|
||||||
| openrouter | per token | yes | 200+ models |
|
| openrouter | per token | yes | 200+ models |
|
||||||
| openai | per token | yes | gpt-4o |
|
| openai | per token | yes | gpt-4o |
|
||||||
| bedrock | per token | yes | claude on aws |
|
| bedrock | per token | yes | claude on aws |
|
||||||
|
|
||||||
|
*ollama vision requires a vision model like `llama3.2-vision` or `llava`
|
||||||
|
|
||||||
## config
|
## config
|
||||||
|
|
||||||
all in `.env`:
|
all in `.env`:
|
||||||
@@ -221,7 +230,7 @@ src/
|
|||||||
skills.ts 6 multi-step skills
|
skills.ts 6 multi-step skills
|
||||||
workflow.ts workflow orchestration
|
workflow.ts workflow orchestration
|
||||||
flow.ts yaml flow runner
|
flow.ts yaml flow runner
|
||||||
llm-providers.ts 4 providers + system prompt
|
llm-providers.ts 5 providers + system prompt
|
||||||
sanitizer.ts accessibility xml parser
|
sanitizer.ts accessibility xml parser
|
||||||
config.ts env config
|
config.ts env config
|
||||||
constants.ts keycodes, coordinates
|
constants.ts keycodes, coordinates
|
||||||
|
|||||||
@@ -772,13 +772,19 @@ cp .env.example .env</pre>
|
|||||||
<div class="stepper-step">
|
<div class="stepper-step">
|
||||||
<span class="stepper-num">3</span>
|
<span class="stepper-num">3</span>
|
||||||
<h3>configure an llm provider</h3>
|
<h3>configure an llm provider</h3>
|
||||||
<p>edit <code>.env</code> - fastest way to start is groq (free tier):</p>
|
<p>edit <code>.env</code> - fastest way to start is ollama (fully local, no api key):</p>
|
||||||
<pre>LLM_PROVIDER=groq
|
<pre># local (no api key needed)
|
||||||
|
# run once in your shell first (not a .env entry): ollama pull llama3.2
|
||||||
|
LLM_PROVIDER=ollama
|
||||||
|
|
||||||
|
# or cloud (free tier)
|
||||||
|
LLM_PROVIDER=groq
|
||||||
GROQ_API_KEY=gsk_your_key_here</pre>
|
GROQ_API_KEY=gsk_your_key_here</pre>
|
||||||
<table>
|
<table>
|
||||||
<thead><tr><th>provider</th><th>cost</th><th>vision</th><th>notes</th></tr></thead>
|
<thead><tr><th>provider</th><th>cost</th><th>vision</th><th>notes</th></tr></thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr><td>groq</td><td>free</td><td>no</td><td>fastest to start</td></tr>
|
<tr><td>ollama</td><td>free (local)</td><td>yes*</td><td>no api key, runs on your machine</td></tr>
|
||||||
|
<tr><td>groq</td><td>free tier</td><td>no</td><td>fastest cloud option</td></tr>
|
||||||
<tr><td>openrouter</td><td>per token</td><td>yes</td><td>200+ models</td></tr>
|
<tr><td>openrouter</td><td>per token</td><td>yes</td><td>200+ models</td></tr>
|
||||||
<tr><td>openai</td><td>per token</td><td>yes</td><td>gpt-4o</td></tr>
|
<tr><td>openai</td><td>per token</td><td>yes</td><td>gpt-4o</td></tr>
|
||||||
<tr><td>bedrock</td><td>per token</td><td>yes</td><td>claude on aws</td></tr>
|
<tr><td>bedrock</td><td>per token</td><td>yes</td><td>claude on aws</td></tr>
|
||||||
@@ -947,7 +953,7 @@ actions.ts 22 actions + adb retry
|
|||||||
skills.ts 6 multi-step skills
|
skills.ts 6 multi-step skills
|
||||||
workflow.ts workflow orchestration
|
workflow.ts workflow orchestration
|
||||||
flow.ts yaml flow runner
|
flow.ts yaml flow runner
|
||||||
llm-providers.ts 4 providers + system prompt
|
llm-providers.ts 5 providers + system prompt
|
||||||
sanitizer.ts accessibility xml parser
|
sanitizer.ts accessibility xml parser
|
||||||
config.ts env config
|
config.ts env config
|
||||||
constants.ts keycodes, coordinates
|
constants.ts keycodes, coordinates
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import {
|
|||||||
DEFAULT_GROQ_MODEL,
|
DEFAULT_GROQ_MODEL,
|
||||||
DEFAULT_OPENAI_MODEL,
|
DEFAULT_OPENAI_MODEL,
|
||||||
DEFAULT_BEDROCK_MODEL,
|
DEFAULT_BEDROCK_MODEL,
|
||||||
|
DEFAULT_OLLAMA_MODEL,
|
||||||
DEFAULT_MAX_RETRIES,
|
DEFAULT_MAX_RETRIES,
|
||||||
DEFAULT_STUCK_THRESHOLD,
|
DEFAULT_STUCK_THRESHOLD,
|
||||||
DEFAULT_MAX_ELEMENTS,
|
DEFAULT_MAX_ELEMENTS,
|
||||||
@@ -56,7 +57,7 @@ export const Config = {
|
|||||||
// Streaming responses
|
// Streaming responses
|
||||||
STREAMING_ENABLED: env("STREAMING_ENABLED", String(DEFAULT_STREAMING_ENABLED)) === "true",
|
STREAMING_ENABLED: env("STREAMING_ENABLED", String(DEFAULT_STREAMING_ENABLED)) === "true",
|
||||||
|
|
||||||
// LLM Provider: "groq", "openai", "bedrock", or "openrouter"
|
// LLM Provider: "groq", "openai", "bedrock", "openrouter", or "ollama"
|
||||||
LLM_PROVIDER: env("LLM_PROVIDER", "groq"),
|
LLM_PROVIDER: env("LLM_PROVIDER", "groq"),
|
||||||
|
|
||||||
// Groq Configuration
|
// Groq Configuration
|
||||||
@@ -75,11 +76,16 @@ export const Config = {
|
|||||||
OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
|
OPENROUTER_API_KEY: env("OPENROUTER_API_KEY"),
|
||||||
OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),
|
OPENROUTER_MODEL: env("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet"),
|
||||||
|
|
||||||
|
// Ollama Configuration (local LLMs, no API key needed)
|
||||||
|
OLLAMA_BASE_URL: env("OLLAMA_BASE_URL", "http://localhost:11434/v1"),
|
||||||
|
OLLAMA_MODEL: env("OLLAMA_MODEL", DEFAULT_OLLAMA_MODEL),
|
||||||
|
|
||||||
getModel(): string {
|
getModel(): string {
|
||||||
const provider = Config.LLM_PROVIDER;
|
const provider = Config.LLM_PROVIDER;
|
||||||
if (provider === "groq") return Config.GROQ_MODEL;
|
if (provider === "groq") return Config.GROQ_MODEL;
|
||||||
if (provider === "bedrock") return Config.BEDROCK_MODEL;
|
if (provider === "bedrock") return Config.BEDROCK_MODEL;
|
||||||
if (provider === "openrouter") return Config.OPENROUTER_MODEL;
|
if (provider === "openrouter") return Config.OPENROUTER_MODEL;
|
||||||
|
if (provider === "ollama") return Config.OLLAMA_MODEL;
|
||||||
return Config.OPENAI_MODEL;
|
return Config.OPENAI_MODEL;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
// API Endpoints
|
// API Endpoints
|
||||||
// ===========================================
|
// ===========================================
|
||||||
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";
|
export const GROQ_API_BASE_URL = "https://api.groq.com/openai/v1";
|
||||||
|
export const OLLAMA_API_BASE_URL = "http://localhost:11434/v1";
|
||||||
|
|
||||||
// ===========================================
|
// ===========================================
|
||||||
// ADB Key Codes
|
// ADB Key Codes
|
||||||
@@ -80,6 +81,7 @@ export const DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile";
|
|||||||
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
|
export const DEFAULT_OPENAI_MODEL = "gpt-4o";
|
||||||
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
|
export const DEFAULT_BEDROCK_MODEL = "us.meta.llama3-3-70b-instruct-v1:0";
|
||||||
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";
|
export const DEFAULT_OPENROUTER_MODEL = "anthropic/claude-3.5-sonnet";
|
||||||
|
export const DEFAULT_OLLAMA_MODEL = "llama3.2";
|
||||||
|
|
||||||
// ===========================================
|
// ===========================================
|
||||||
// Bedrock Model Identifiers
|
// Bedrock Model Identifiers
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
/**
|
/**
|
||||||
* LLM Provider module for DroidClaw.
|
* LLM Provider module for DroidClaw.
|
||||||
* Supports OpenAI, Groq, AWS Bedrock, and OpenRouter (via Vercel AI SDK).
|
* Supports OpenAI, Groq, AWS Bedrock, OpenRouter (via Vercel AI SDK), and Ollama (local).
|
||||||
*
|
*
|
||||||
* Phase 3: Real multimodal vision (image content parts)
|
* Phase 3: Real multimodal vision (image content parts)
|
||||||
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
|
* Phase 4A: Multi-turn conversation memory (ChatMessage[] interface)
|
||||||
@@ -20,6 +20,7 @@ import { z } from "zod";
|
|||||||
import { Config } from "./config.js";
|
import { Config } from "./config.js";
|
||||||
import {
|
import {
|
||||||
GROQ_API_BASE_URL,
|
GROQ_API_BASE_URL,
|
||||||
|
OLLAMA_API_BASE_URL,
|
||||||
BEDROCK_ANTHROPIC_MODELS,
|
BEDROCK_ANTHROPIC_MODELS,
|
||||||
BEDROCK_META_MODELS,
|
BEDROCK_META_MODELS,
|
||||||
} from "./constants.js";
|
} from "./constants.js";
|
||||||
@@ -265,6 +266,14 @@ class OpenAIProvider implements LLMProvider {
|
|||||||
});
|
});
|
||||||
this.model = Config.GROQ_MODEL;
|
this.model = Config.GROQ_MODEL;
|
||||||
this.capabilities = { supportsImages: false, supportsStreaming: true };
|
this.capabilities = { supportsImages: false, supportsStreaming: true };
|
||||||
|
} else if (Config.LLM_PROVIDER === "ollama") {
|
||||||
|
this.client = new OpenAI({
|
||||||
|
apiKey: "ollama", // required by the SDK but ignored by Ollama
|
||||||
|
baseURL: Config.OLLAMA_BASE_URL,
|
||||||
|
});
|
||||||
|
this.model = Config.OLLAMA_MODEL;
|
||||||
|
// Assumes a vision-capable model (llava, llama3.2-vision, etc.); text-only models such as the default llama3.2 cannot process images
|
||||||
|
this.capabilities = { supportsImages: true, supportsStreaming: true };
|
||||||
} else {
|
} else {
|
||||||
this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
|
this.client = new OpenAI({ apiKey: Config.OPENAI_API_KEY });
|
||||||
this.model = Config.OPENAI_MODEL;
|
this.model = Config.OPENAI_MODEL;
|
||||||
@@ -646,5 +655,6 @@ export function getLlmProvider(): LLMProvider {
|
|||||||
if (Config.LLM_PROVIDER === "openrouter") {
|
if (Config.LLM_PROVIDER === "openrouter") {
|
||||||
return new OpenRouterProvider();
|
return new OpenRouterProvider();
|
||||||
}
|
}
|
||||||
|
// OpenAI, Groq, and Ollama all use OpenAI-compatible API
|
||||||
return new OpenAIProvider();
|
return new OpenAIProvider();
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user