diff --git a/.env.example b/.env.example index 10462e7..2784c41 100644 --- a/.env.example +++ b/.env.example @@ -2,137 +2,74 @@ # Copy this file to .env and fill in your values before running install.sh # cp .env.example .env # -# ── VaultWarden Integration ────────────────────────────────────────────────────── +# ── Bitwarden / VaultWarden Integration ─────────────────────────────────────── # This project supports placeholders in .env values. -# The start.sh script automatically resolves them using the `bw` CLI before -# starting the stack. -# -# Placeholder format: +# The install.sh script can set this up for you. +# Placeholder format: # Examples: -# # +# # # Requirements: -# - Bitwarden CLI installed: https://bitwarden.com/download/ -# - Vault unlocked, or BW_CLIENT_ID + BW_CLIENT_SECRET + VAULT_MASTER_PASSWORD set -# - Organization ID: f8a8b00f-496a-44d3-b9d6-5ed28ecd95a3 +# - Bitwarden CLI installed (npm install -g @bitwarden/cli) +# - Vault unlocked, or BW_CLIENT_ID + BW_CLIENT_SECRET set +# +# For self-hosted VaultWarden, set: +# BW_SERVER_URL=https://vaultwarden.example.com # -# To manually resolve: ./scripts/resolve-vaultwarden.sh --in-place +# To resolve manually: ./scripts/resolve-vaultwarden.sh # ─── Host paths ─────────────────────────────────────────────────────────────── -# User who will run the stack (must be in docker group) STACK_USER=yourusername - -# Where Ollama models and config are stored on the host OLLAMA_DATA=/home/${STACK_USER}/.ollama # ─── Intel GPU device nodes ─────────────────────────────────────────────────── -# Run: ls -la /dev/dri/ to find your card node (card0 or card1) -# On Meteor Lake / Arrow Lake this can drift between reboots -# check-arc-gpu.sh will detect and correct this automatically on each boot GPU_CARD=/dev/dri/card1 GPU_RENDER=/dev/dri/renderD128 # ── Ollama ──────────────────────────────────────────────────────────────────── -# Port mappings (change if you have conflicts) OLLAMA_PORT=11434 -OLLAMA_DATA=/home/${STACK_USER}/.ollama -# Optional: SSH key for remote model sync (leave blank if not using) OLLAMA_SSH_KEY=/home/${STACK_USER}/.ssh/id_ed25519_ollama OLLAMA_SSH_KEY_PUB=/home/${STACK_USER}/.ssh/id_ed25519_ollama.pub # ── Cloud API Keys ──────────────────────────────────────────────────────────── -ANTHROPIC_API_KEY= -GEMINI_API_KEY= -OPENAI_API_KEY= # optional — leave blank if not using OpenAI +ANTHROPIC_API_KEY=sk-ant-... +GEMINI_API_KEY=your-gemini-key +OPENAI_API_KEY= # optional — leave blank if not using OpenAI # ── LiteLLM ─────────────────────────────────────────────────────────────────── LITELLM_PORT=4000 -# Admin key — used to log into LiteLLM UI and as Open WebUI's OpenAI API key -# Change this before exposing on any network LITELLM_MASTER_KEY=sk-local-admin-changeme -# ── Olla ────────────────────────────────────────────────────────────────────── +# ── Olla — unified LLM router ───────────────────────────────────────────────── OLLA_PORT=40114 # Additional Ollama nodes on your LAN. -# Declare each extra node as its own OLLAMA_REMOTE_* variable, since the setup -# scripts read OLLAMA_REMOTE_* entries when generating proxy/olla.yaml and -# registering remote connections. -# -# Format: -# OLLAMA_REMOTE_=http://host:port[:priority] -# -# Priority is optional (defaults to 70). The local ollama-arc node (100) and -# litellm-cloud (50) are always included — only add your *extra* nodes here. 
-# +# Format: OLLAMA_REMOTE_=http://host:port[:priority] # Examples: -# One extra node: -# OLLAMA_REMOTE_WORKSTATION=http://192.168.1.50:11434:75 -# Multiple nodes: -# OLLAMA_REMOTE_WORKSTATION=http://192.168.1.50:11434:75 -# OLLAMA_REMOTE_NAS_BOX=http://192.168.1.51:11434:60 -# No priority (defaults to 70): -# OLLAMA_REMOTE_WORKSTATION=http://192.168.1.50:11434 +# OLLAMA_REMOTE_WORKSTATION=http://192.168.1.50:11434:75 +# OLLAMA_REMOTE_NAS_BOX=http://192.168.1.51:11434 # Advanced Olla tuning (optional — defaults shown) -# OLLA_ENGINE=sherpa # or "olla" for circuit breakers + connection pooling -# OLLA_LOAD_BALANCER=least-connections # or "round-robin" or "priority" +# OLLA_ENGINE=sherpa +# OLLA_LOAD_BALANCER=least-connections # OLLA_REQUEST_LOGGING=true -# ── Open WebUI ──────────────────────────────────────────────────────────────── -WEBUI_PORT=3000 -WEBUI_NAME=AssistantOS -WEBUI_SECRET_KEY= # Generate with: python3 -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" - -# ── Open Terminal ───────────────────────────────────────────────────────────── -TERMINAL_PORT=8000 -# Change these before deploying — do not use defaults in production -OPEN_TERMINAL_API_KEY= - -# ── Pipelines ───────────────────────────────────────────────────────────────── -PIPELINES_PORT=9099 -PIPELINES_API_KEY= +# ── Retriever — Obsidian vault RAG service ──────────────────────────────────── +# Lightweight replacement for Khoj + PostgreSQL. +# Uses sqlite-vec (file-based), hybrid search (FTS5 + vector), watchdog live indexing. +# API-only: no web UI. Designed as an OpenCode tool. +RETRIEVER_PORT=42000 +RETRIEVER_VAULT_PATH=/home/${STACK_USER}/obsidian +RETRIEVER_EMBED_MODEL=nomic-embed-text +RETRIEVER_CHUNK_SIZE=512 +RETRIEVER_CHUNK_OVERLAP=64 # ─── Remote Ollama instances ────────────────────────────────────────────────── -# Format: OLLAMA_REMOTE_=http://:11434 -# The name becomes the instance key in System Diagnostics and Open WebUI connections. -# Add as many as you need — post-install.sh will register all of them automatically. -# +# These are added to Olla's routing and can be discovered by discover-herd.sh. # Examples: # OLLAMA_REMOTE_frank=http://10.10.1.1:11434 # OLLAMA_REMOTE_lab1=http://10.10.1.2:11434 -# OLLAMA_REMOTE_lab2=http://10.10.1.3:11434 -# OLLAMA_REMOTE_lab3=http://10.10.1.4:11434 -# OLLAMA_REMOTE_lab4=http://10.10.1.5:11434 # ─── Models to pull after install ───────────────────────────────────────────── -# Space-separated list of models to pull on first run -# Recommended stack for Intel Arc iGPU (14b models run well on 32GB shared RAM) # MODELS_TO_PULL="deepseek-r1:14b qwen2.5-coder:14b gemma3:12b qwen2.5:14b nomic-embed-text:latest" - -# ─── Khoj (AI second brain + Obsidian RAG) ──────────────────────────────────── -# Khoj indexes your Obsidian vault for semantic search and RAG over your notes. -# See docs/khoj-setup.md for Obsidian plugin configuration. -KHOJ_PORT=42110 -KHOJ_ADMIN_EMAIL=admin@localhost -KHOJ_ADMIN_PASSWORD= -# Generate a random secret key: python3 -c "import secrets; print(secrets.token_hex(32))" -KHOJ_DJANGO_SECRET_KEY= # Generate: python3 -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" -KHOJ_DB_PASSWORD= -# Tip: generate a strong password with: -# python3 -c "import secrets; print(secrets.token_hex(24))" -# Set to true to allow unauthenticated (anonymous) local access to Khoj. -# Leave false (default) unless you are behind a trusted reverse proxy. 
-KHOJ_NO_AUTH=false -# Path to your Obsidian vault on the host machine (mounted read-only into Khoj) -OBSIDIAN_VAULT_PATH=/home/yourusername/obsidian-vault - -# ─── Khoj Sync (CouchDB → Khoj live indexing) ───────────────────────────────── -COUCHDB_URL=https://sync.yourdomain.com -COUCHDB_DB=your-journal -COUCHDB_USER=your-couchdb-username -COUCHDB_PASSWORD= -# Set to true after first successful sync to skip full re-index on restart -KHOJ_SYNC_SKIP_INITIAL=false -KHOJ_SYNC_LOG_LEVEL=INFO diff --git a/.gitignore b/.gitignore index 98df6e0..c0f3554 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # ── secrets & local config ──────────────────────────────────────────────────── .env .env.backup* +.env.example.backup* .env.*.backup* tmp/ @@ -31,3 +32,6 @@ Thumbs.db # Opencode AGENTS.md + +# Obsidian workspace config (local + auto-generated by install.sh) +.obsidian/ diff --git a/.opencode/config.json b/.opencode/config.json new file mode 100644 index 0000000..d2112fd --- /dev/null +++ b/.opencode/config.json @@ -0,0 +1,11 @@ +{ + "$schema": "https://opencode.ai/config.json", + "tools": { + "vault-search": true, + "vault-search_per_source": true + }, + "permission": { + "vault-search": "allow", + "vault-search_per_source": "allow" + } +} diff --git a/.opencode/tools/vault-search.ts b/.opencode/tools/vault-search.ts new file mode 100644 index 0000000..9c037eb --- /dev/null +++ b/.opencode/tools/vault-search.ts @@ -0,0 +1,62 @@ +import { tool } from "@opencode-ai/plugin" + +export default tool({ + description: "Search your Obsidian vault for notes matching a query. Uses the retriever service (sqlite-vec + FTS5 hybrid search). Returns file paths, content snippets, and relevance scores.", + args: { + query: tool.schema.string().describe("Natural language search query"), + top_k: tool.schema.number().default(5).describe("Number of results to return (default 5)"), + include_content: tool.schema.boolean().default(true).describe("Include full chunk content in results"), + }, + async execute(args) { + const resp = await fetch("http://localhost:42000/search", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query: args.query, top_k: args.top_k }), + }) + if (!resp.ok) { + return `Retriever error: ${resp.status} ${resp.statusText}` + } + const data = await resp.json() + if (!data.results || data.results.length === 0) { + return `No results found for: "${args.query}"` + } + return data.results.map((r: any) => { + let out = `## ${r.filepath} (score: ${r.score})` + if (r.parent_heading) out += `\nSection: ${r.parent_heading}` + if (args.include_content) out += `\n${r.content.slice(0, 2000)}` + return out + }).join("\n\n---\n\n") + }, +}) + +export const per_source = tool({ + description: "Search only a specific file or subdirectory in your Obsidian vault", + args: { + query: tool.schema.string().describe("Natural language search query"), + path_filter: tool.schema.string().describe("Filter results to a specific file or directory (e.g. 
'networking/' or 'projects/ideas.md')"), + top_k: tool.schema.number().default(5).describe("Number of results to return (default 5)"), + }, + async execute(args) { + const resp = await fetch("http://localhost:42000/search", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query: args.query, top_k: args.top_k * 2 }), + }) + if (!resp.ok) { + return `Retriever error: ${resp.status} ${resp.statusText}` + } + const data = await resp.json() + let results = data.results || [] + results = results.filter((r: any) => r.filepath.startsWith(args.path_filter)) + results = results.slice(0, args.top_k) + if (results.length === 0) { + return `No results in "${args.path_filter}" for: "${args.query}"` + } + return results.map((r: any) => { + let out = `## ${r.filepath} (score: ${r.score})` + if (r.parent_heading) out += `\nSection: ${r.parent_heading}` + out += `\n${r.content.slice(0, 2000)}` + return out + }).join("\n\n---\n\n") + }, +}) diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..895c124 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,139 @@ +# AGENTS.md — ai-stack + +This is a Docker Compose-based AI stack for **Intel Arc iGPU on Linux**, managed via systemd. The stack provides local LLM inference (ollama-arc), cloud API routing (LiteLLM), unified routing/load balancing (Olla), and Obsidian vault RAG (retriever). The primary AI interface is **OpenCode** (CLI + Obsidian sidebar plugin). + +## Developer commands + +```bash +# Start / stop / restart (via systemd, preferred) +sudo systemctl start|stop|restart ai-stack.service + +# Direct docker compose (for testing, not persistent) +# NOTE: start.sh auto-resolves placeholders first +./start.sh # foreground +./start.sh -d # detached +./start.sh down # tear down + +# Regenerate Olla config after changing .env OLLAMA_REMOTE_* entries +./scripts/generate-olla-config.sh + +# Resolve Bitwarden/VaultWarden placeholders in .env (auto-runs in start.sh) +./scripts/resolve-vaultwarden.sh # resolve in-place +./scripts/resolve-vaultwarden.sh --dry-run # preview only + +# Discover other Ollama nodes on the LAN +./scripts/discover-herd.sh # prompt before writing +./scripts/discover-herd.sh --apply # write without prompt +./scripts/discover-herd.sh --dry-run # scan only + +# Discover AI services across all networks (LAN + VPN) +./scripts/discover-network.sh # interactive +./scripts/discover-network.sh 10.10.0.201:11434 # seed(s) as args, prompts +./scripts/discover-network.sh --apply # add all discovered +./scripts/discover-network.sh --dry-run # scan only + +# Check retriever status +curl localhost:42000/health + +# Search the vault +curl -X POST localhost:42000/search -H 'Content-Type: application/json' \ + -d '{"query":"what did I write about networking?"}' + +# Force vault reindex +curl -X POST localhost:42000/reindex + +# CI validation (run locally before push) +docker compose config --quiet +shellcheck -s bash scripts/*.sh install.sh + +# GPU pre-flight check +./scripts/check-arc-gpu.sh +``` + +## Architecture + +All traffic flows through **Olla** (port 40114) as the unified LLM router: + +``` +OpenCode (CLI + Obsidian plugin) + ├── tool: retriever :42000 → sqlite-vec + FTS5 hybrid search over vault + ├── provider: Olla :40115 → Smart Router (auto-selects local model) + │ → ollama-arc :11434 (Intel Arc iGPU) + │ → OLLAMA_REMOTE_* nodes (LAN, optional) + └── provider: LiteLLM :4000 → Claude (Anthropic), Gemini (Google) +``` + +### Service responsibilities + +| Directory/File | Purpose | +|---|---| +| 
`docker-compose.yml` | Core stack: ollama-arc, litellm, olla, router, retriever | +| `install.sh` | Preflight → create volumes → install systemd → start stack → pull models (prompts for Bitwarden setup) | +| `retriever/` | Obsidian vault RAG: FastAPI + sqlite-vec + watchdog. Hybrid search via FTS5 + vector embeddings. | +| `scripts/generate-olla-config.sh` | Reads `OLLAMA_REMOTE_*` from `.env` → writes `proxy/olla.yaml` | +| `scripts/discover-herd.sh` | mDNS + subnet scan for other Ollama nodes on LAN | +| `scripts/check-arc-gpu.sh` | GPU pre-flight: detects card0/card1 drift, updates `.env`, used as `ExecStartPre` | +| `scripts/resolve-vaultwarden.sh` | Resolves `` placeholders in `.env` via `bw` CLI | +| `router/` | Smart Model Router: content-based model selection between OpenCode and Olla | +| `proxy/litellm_config.yaml` | Static LiteLLM model registry (Claude, Gemini models) | + +## Key conventions + +- **`.env` is the single source of truth** for LAN node addresses, GPU paths, API keys, and ports. Scripts parse `OLLAMA_REMOTE_*` vars — do not hardcode IPs in compose or scripts. +- **`proxy/olla.yaml` is auto-generated** — never edit directly. Regenerate via `scripts/generate-olla-config.sh`. It's in `.gitignore`. +- **GPU card node drifts** (`card0` vs `card1`) on Meteor Lake reboots. `renderD128` is stable. `check-arc-gpu.sh` detects and corrects via systemd `ExecStartPre`. +- **Retriever volume** is managed by compose (`retriever-data`). Contains the sqlite-vec database with embeddings and FTS5 index. +- **`OLLAMA_KEEP_ALIVE=-1`** keeps models resident in shared system RAM (Intel iGPU uses shared memory, not dedicated VRAM). +- **VaultWarden integration is optional** — `install.sh` prompts to set it up. If declined, `.env` stores API keys in plaintext (standard practice for local-only stacks). +- **Never commit secrets** — `.env` is in `.gitignore`. The installer generates `LITELLM_MASTER_KEY` and stores it in Bitwarden automatically. Anthropic/Gemini keys are stored as `` placeholders and resolved at runtime. 
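+
+A minimal sketch of the `OLLAMA_REMOTE_*` parsing convention described above (illustrative only; the real parser is `scripts/generate-olla-config.sh`, which is bash, and `parse_remote` is a hypothetical helper):
+
+```python
+# Illustrates the OLLAMA_REMOTE_<NAME>=http://host:port[:priority] convention.
+# Not the actual generate-olla-config.sh logic (which is bash).
+import re
+
+DEFAULT_PRIORITY = 70  # local ollama-arc is 100, litellm-cloud is 50
+
+def parse_remote(name: str, value: str) -> dict:
+    m = re.match(r"^(https?://[^:/]+:\d+)(?::(\d+))?$", value)
+    if not m:
+        raise ValueError(f"bad OLLAMA_REMOTE_{name}: {value}")
+    url, priority = m.groups()
+    return {
+        "name": name.lower(),
+        "url": url,
+        "priority": int(priority) if priority else DEFAULT_PRIORITY,
+    }
+
+# parse_remote("WORKSTATION", "http://192.168.1.50:11434:75")
+#   -> {"name": "workstation", "url": "http://192.168.1.50:11434", "priority": 75}
+```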
+ +## RAG (retriever) + +The retriever service replaces Khoj + PostgreSQL with a lightweight, API-only service: + +- **Vector store**: sqlite-vec (embedded SQLite extension, file-based, no separate DB) +- **Keyword search**: SQLite FTS5 (BM25 scoring) +- **Hybrid search**: Reciprocal Rank Fusion (RRF) combining vector + keyword results +- **Embeddings**: `nomic-embed-text` via Olla → ollama-arc +- **Indexing**: Full scan on startup, then watchdog (inotify) for live changes +- **API**: `POST /search`, `POST /reindex`, `GET /health` + +Configuration via `.env`: +``` +RETRIEVER_PORT=42000 +RETRIEVER_VAULT_PATH=/home/user/obsidian +RETRIEVER_EMBED_MODEL=nomic-embed-text +RETRIEVER_CHUNK_SIZE=512 +RETRIEVER_CHUNK_OVERLAP=64 +``` + +## Testing + +- `pytest.ini` sets `testpaths = tests` and `asyncio_mode = auto` +- Python test deps: `requirements-dev.txt` (pytest, pytest-asyncio, httpx, pydantic) +- CI runs: `docker compose config --quiet`, shellcheck, `.env.example` credential scan + +## CI / Release + +- **CI** (`.github/workflows/ci.yml`): validates compose syntax, shellchecks scripts, scans `.env.example` for leaked credentials +- **Release** (`.github/workflows/release.yml`): triggered on `v*.*.*` tags, extracts notes from `CHANGELOG.md` +- Branch: `main` + +## OpenCode tools + +The project includes two custom tools for Obsidian vault search: + +- **`vault-search`** — search the entire vault for notes matching a query +- **`vault-search_per_source`** — search within a specific file or subdirectory + +Both tools call the retriever service (`:42000`) and return file paths, content snippets, and scores. Use them when asked to find information in notes. + +## Gotchas + +- LiteLLM healthcheck URL is `/health/liveness` (not "liveliness") +- `install.sh` model pull is interactive (prompts y/N) — non-headless +- Retriever depends on Olla for embeddings — ensure Olla is healthy before retriever starts +- Vault is mounted read-only (`:ro`) — retriever never modifies notes +- `discover-herd.sh` requires `avahi-daemon` running on the host for mDNS discovery +- `bw login --apikey` is incompatible with self-hosted VaultWarden (no `userDecryptionOptions` in response). Use interactive `bw login` or an existing unlocked session for `resolve-vaultwarden.sh`. +- `install.sh` auto-generates the `LITELLM_MASTER_KEY`, creates a `litellm-master-key` item in your Bitwarden vault, and writes a `` placeholder to `.env`. For `anthropic-api-key` and `gemini-api-key`, you must create those items manually via the Bitwarden web vault. 
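+
+## Appendix: RRF fusion sketch
+
+The Reciprocal Rank Fusion step described under "RAG (retriever)" above is small enough to show inline. A sketch of the standard RRF formula, not `retriever/search.py` itself (the result shapes are assumed):
+
+```python
+# Standard Reciprocal Rank Fusion: score(d) = sum over backends of 1 / (k + rank).
+# Assumes each backend (FTS5 keyword, sqlite-vec vector) returns an ordered
+# list of chunk IDs, best first. Not the retriever's actual code.
+def rrf(rankings: list[list[str]], k: int = 60) -> list[tuple[str, float]]:
+    scores: dict[str, float] = {}
+    for ranking in rankings:
+        for rank, chunk_id in enumerate(ranking, start=1):
+            scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
+    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
+
+# fused = rrf([fts5_ranking, vector_ranking])  # k=60 is the conventional constant
+```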
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d39c7de..87825bd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]

 ### Added
+- Bitwarden/VaultWarden optional secret management
+  - `install.sh` prompts to configure during setup; installs `bw` CLI, collects API credentials, writes `BW_CLIENT_ID`/`BW_CLIENT_SECRET` to `.env`
+  - `install.sh` detects an existing unlocked `bw` session, supports a self-hosted VaultWarden URL, auto-generates `LITELLM_MASTER_KEY` and stores it in Bitwarden
+  - `scripts/resolve-vaultwarden.sh` rewritten: authenticates via API key or existing session, resolves `` placeholders in `.env`
+  - `start.sh` auto-runs `resolve-vaultwarden.sh` before starting the stack
+  - `.env.example` documents the vaultwarden placeholder format
+  - AGENTS.md updated with resolve commands and table entry
 - GitHub Actions CI workflow (docker-compose validation, shellcheck)
 - GitHub Actions Release workflow (auto-release on version tags)
 - Pull Request template
@@ -18,6 +25,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - SECURITY.md policy
 - CODE_OF_CONDUCT.md (Contributor Covenant v2.1)
 - CHANGELOG.md
+- **Retriever service** — lightweight Obsidian vault RAG replacing Khoj + PostgreSQL
+  - FastAPI + sqlite-vec (file-based vector store, no separate DB)
+  - Hybrid search: FTS5 keyword (BM25) + vector similarity, fused via RRF
+  - Watchdog live vault indexing (inotify)
+  - API-only: `POST /search`, `POST /reindex`, `GET /health`
+  - Embeddings via Olla → ollama-arc (nomic-embed-text)
+- **Smart Model Router** — standalone `router/` service for content-based model selection between OpenCode and Olla
+- **discover-herd.sh** — mDNS + subnet scan for auto-discovery of remote Ollama nodes
+- **PLANS.md** — design document for stack simplification
+
+### Removed
+- **Open WebUI** — replaced by OpenCode (CLI + Obsidian sidebar plugin)
+- **Pipelines** — no longer needed (`smart_model_router.py` superseded by the standalone `router/` service)
+- **Open Terminal** — no longer needed
+- **Khoj / khoj-db** — replaced by retriever service (sqlite-vec, no PostgreSQL)
+- **post-install.sh** — entirely targeted the Open WebUI API
+- **tools/system_diagnostics.py** — Open WebUI tool protocol
+- **khoj-sync/** — never-implemented CouchDB sync
+- All associated environment variables (WEBUI_*, PIPELINES_*, KHOJ_*, COUCHDB_*, etc.)
+
+### Changed
+- Stack reduced from 8 services to 5 (ollama-arc, litellm, olla, router, retriever)
+- `.env.example` slimmed down to only active configuration
+- `install.sh` updated: no pipelines deployment, no Open WebUI volume, updated completion output
+- `AGENTS.md` reflects new architecture

 ## [0.1.0] - 2026-04-30

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 515751e..b1894c8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -31,10 +31,13 @@ Thank you for your interest in contributing! This project is built from real hom
    docker compose config

    # Lint shell scripts
-   shellcheck scripts/*.sh install.sh post-install.sh
+   shellcheck scripts/*.sh install.sh
    ```

-4. **Never commit real credentials.** Use placeholder values like `changeme` in examples.
+4. **Never commit real credentials.**
+   - Use `` placeholders for API keys if Bitwarden is configured
+   - Never commit a resolved `.env` file (only placeholder values should appear)
+   - Use placeholder values like `changeme` in examples

5. **Open a pull request** against `main` and fill in the PR template.
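+
+The `.env.example` credential scan that CI runs (see AGENTS.md) is easy to approximate locally before pushing. A rough sketch of the idea; the patterns here are illustrative, not the workflow's actual rules:
+
+```python
+# Rough local stand-in for CI's .env.example credential scan.
+# The SUSPECT patterns are illustrative guesses, not the real CI rules.
+import re
+import sys
+
+SUSPECT = [
+    re.compile(r"sk-ant-[A-Za-z0-9_-]{20,}"),  # real-looking Anthropic key
+    re.compile(r"=[A-Fa-f0-9]{32,}\s*$"),      # long raw hex secret
+]
+
+def scan(path: str = ".env.example") -> int:
+    hits = 0
+    with open(path) as fh:
+        for lineno, line in enumerate(fh, start=1):
+            if line.lstrip().startswith("#"):
+                continue  # comments may mention key formats freely
+            if any(p.search(line) for p in SUSPECT):
+                print(f"{path}:{lineno}: looks like a real credential")
+                hits += 1
+    return hits
+
+if __name__ == "__main__":
+    sys.exit(1 if scan() else 0)
+```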
@@ -53,7 +56,7 @@ ci: update shellcheck action to v2 - Shell scripts: POSIX-compatible where possible; always pass `shellcheck` - Docker Compose: keep services alphabetically ordered within logical groups -- Python (pipelines / tools): follow PEP 8; add a module-level docstring +- Python (retriever): follow PEP 8 ## Reporting security issues diff --git a/PLANS.md b/PLANS.md new file mode 100644 index 0000000..b7f13c7 --- /dev/null +++ b/PLANS.md @@ -0,0 +1,207 @@ +# PLANS — ai-stack simplification and retriever service + +## Status: Approved for implementation + +--- + +## Summary + +Remove Open WebUI, Pipelines, Open Terminal, and Khoj from the stack. +Replace Khoj's Obsidian RAG with a lightweight, API-only retrieval service. +Add an mDNS-based herd discovery script for other Ollama nodes on the LAN. +OpenCode becomes the primary AI interface (CLI + embedded in Obsidian via plugin). + +--- + +## Motivation + +- OpenCode provides a better chat/assistant experience than Open WebUI +- OpenCode is embedded directly in Obsidian via a custom plugin +- Khoj + PostgreSQL is heavy for a single-user RAG backend +- No need for multi-service infrastructure (pipelines, terminal, web UI) when OpenCode handles everything + +--- + +## Architecture after changes + +``` +Notice: Server-Sent Events (may require SSE to work) + +OpenCode (CLI + Obsidian sidebar plugin) + | + |--- tool: retriever :42000 (vault RAG) + | FastAPI + sqlite-vec + watchdog + | hybrid search: BM25 (FTS5) + vector (sqlite-vec) + | embeds via Olla -> ollama-arc (nomic-embed-text) + | vault mounted :ro at /vault + | + |--- provider: Olla :40114 (unified LLM router) + | |--- ollama-arc :11434 (Intel Arc iGPU, local) + | |--- litellm :4000 (Claude, Gemini, cloud) + | |--- OLLAMA_REMOTE_* nodes (LAN, optional) + | + |--- provider: litellm :4000 (direct, optional) + +discoverer (systemd timer): mDNS scan -> updates Olla config + OpenCode providers +``` + +--- + +## Services to remove + +| Service | Reason | Impact | +|---|---|---| +| `open-webui` | Replaced by OpenCode | Lose chat history volume, admin panel | +| `pipelines` | Function pipelines are OpenWebUI-only | smart_model_router.py no longer needed | +| `open-terminal` | Browser terminal, only integrated in WebUI | No loss | +| `khoj` | Replaced by retriever service | Lose Khoj web UI, Obsidian plugin, PostgreSQL | +| `khoj-db` | Replaced by sqlite-vec (file-based) | Volume deleted | + +## Services to keep + +| Service | Reason | +|---|---| +| `ollama-arc` | Local GPU inference for all services | +| `litellm` | Cloud API gateway for Claude, Gemini | +| `olla` | Unified router — retriever + OpenCode both point here | + +## New: retriever service + +### Design + +- **Container**: `retriever` in docker-compose.yml +- **Base image**: `python:3.12-slim` +- **Framework**: FastAPI +- **Vector store**: sqlite-vec (SQLite extension for vector search) +- **Keyword search**: SQLite FTS5 +- **File watching**: watchdog (inotify) +- **Embedding**: POST to Olla -> ollama-arc (nomic-embed-text, 768-dim) +- **Port**: 42000 + +### API + +``` +POST /search + {"query": "what did I write about networking?", "top_k": 10} + -> {"results": [{"path": "...", "content": "...", "score": 0.92}, ...]} + +POST /index + {"paths": ["note1.md", "subdir/note2.md"]} + -> {"indexed": 2, "skipped": 5} + +GET /health + -> {"status": "ok", "indexed_files": 1240, "vault_watching": true} +``` + +### Indexing strategy + +- On startup: full scan of `/vault` (mounted :ro from host) +- Incremental: watchdog watches for file 
create/modify/delete events +- Chunking: markdown-aware splitter (respects `#` headings, `---` thematic breaks) +- Hybrid search: vector similarity (cosine) + keyword relevance (BM25 via FTS5) +- Result fusion: reciprocal rank fusion (RRF) to combine both scores + +## New: discoverer + +### Design + +- **Script**: `scripts/discover-herd.sh` (bash) +- **Trigger**: systemd timer or cron (every 5 min) +- **Mechanism**: `avahi-browse` for `_ollama._tcp` mDNS service type +- **Output**: Writes discovered nodes to: + - `proxy/olla.yaml` (via `generate-olla-config.sh`) + - OpenCode provider config (`.opencode/providers.yaml`) +- **Fallback**: also scan common ports (11434, 11435) on local subnet + +--- + +## File changes + +### New files + +| File | Purpose | +|---|---| +| `retriever/Dockerfile` | Multi-stage Python build | +| `retriever/main.py` | FastAPI app, route handlers | +| `retriever/indexer.py` | Vault scanner, watchdog, chunking | +| `retriever/search.py` | Hybrid search, FTS5 + sqlite-vec, RRF fusion | +| `retriever/requirements.txt` | fastapi, uvicorn, watchdog, sqlite-vec, httpx | +| `scripts/discover-herd.sh` | mDNS discovery + OpenCode/Olla config gen | + +### Modified files + +| File | Change | +|---|---| +| `docker-compose.yml` | Remove open-webui, pipelines, open-terminal, khoj, khoj-db. Add retriever. Drop external volumes. | +| `.env.example` | Strip WEBUI_*, PIPELINES_*, OPEN_TERMINAL_*, KHOJ_*, COUCHDB_*. Add RETRIEVER_*. | +| `.env` | Match .env.example changes | +| `AGENTS.md` | Reflect new architecture and developer commands | +| `start.sh` | Remove post-install hint; add retriever health check | + +### Deleted files + +| File | Reason | +|---|---| +| `post-install.sh` | Entirely targets Open WebUI API | +| `pipelines/` | Open WebUI-specific | +| `tools/` | Open WebUI-specific | +| `khoj-sync/` | Empty, never implemented | +| `docs/post-install.md` | Describes WebUI admin panel steps | + +--- + +## Environment variables + +### Removed + +``` +WEBUI_PORT, WEBUI_NAME, WEBUI_SECRET_KEY +PIPELINES_PORT, PIPELINES_API_KEY +OPEN_TERMINAL_PORT, OPEN_TERMINAL_API_KEY +KHOJ_PORT, KHOJ_ADMIN_EMAIL, KHOJ_ADMIN_PASSWORD +KHOJ_DJANGO_SECRET_KEY, KHOJ_DB_PASSWORD +KHOJ_NO_AUTH, OBSIDIAN_VAULT_PATH +COUCHDB_URL, COUCHDB_DB, COUCHDB_USER, COUCHDB_PASSWORD +KHOJ_SYNC_SKIP_INITIAL, KHOJ_SYNC_LOG_LEVEL +``` + +### Added + +``` +RETRIEVER_PORT=42000 +RETRIEVER_VAULT_PATH=/home/netyeti/obsidian +RETRIEVER_EMBED_MODEL=nomic-embed-text +RETRIEVER_CHUNK_SIZE=512 +RETRIEVER_CHUNK_OVERLAP=64 +``` + +### Kept (unchanged) + +``` +STACK_USER, OLLAMA_DATA, GPU_CARD, GPU_RENDER +OLLAMA_PORT, LITELLM_PORT, LITELLM_MASTER_KEY +OLLA_PORT, ANTHROPIC_API_KEY, GEMINI_API_KEY +OLLAMA_REMOTE_*, MODELS_TO_PULL +OLLA_ENGINE, OLLA_LOAD_BALANCER, OLLA_REQUEST_LOGGING +``` + +--- + +## Developer commands (updated) + +```bash +docker compose up -d # start: ollama-arc, litellm, olla, retriever +docker compose logs -f retriever # watch vault indexing +curl localhost:42000/health # check retriever status +bash scripts/discover-herd.sh # manual discover + config +``` + +--- + +## Rollback + +If the new setup doesn't work: +1. `git checkout` the deleted files from git +2. Revert docker-compose.yml to previous version +3. Recreate the `open-webui` external volume +4. 
Re-pull previous `.env` diff --git a/README.md b/README.md index 799f8e0..d26cb7b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ai-stack -A self-hosted AI stack optimised for **Intel Arc iGPU** on Linux, built around Ollama + Open WebUI with automated model routing, system diagnostics tools, and a systemd-managed Docker Compose stack. +A self-hosted AI stack optimised for **Intel Arc iGPU** on Linux, built around Ollama + OpenCode. Provides local LLM inference (ollama-arc), cloud API routing (LiteLLM), unified routing (Olla), and Obsidian vault RAG (retriever). The primary AI interface is **OpenCode** (CLI + Obsidian sidebar plugin). Built and documented through real-world homelab experience on Intel Arc hardware. @@ -11,11 +11,11 @@ Built and documented through real-world homelab experience on Intel Arc hardware | Component | Purpose | |-----------|---------| | **Ollama (ava-agentone/ollama-intel)** | LLM inference with Intel Arc iGPU acceleration via OneAPI/SYCL | -| **Open WebUI** | Chat interface with tool calling, pipelines, and terminal access | -| **Pipelines** | Server-side plugin system for model routing and workflow automation | -| **Open Terminal** | Browser-based terminal inside Open WebUI (with sudo support) | -| **Smart Model Router** | Auto-routes queries to the best model based on content | -| **System Diagnostics** | Tool for querying Ollama health, models, and VRAM across multiple machines | +| **LiteLLM** | Cloud API gateway (Claude, Gemini) | +| **Olla** | Unified LLM router with load balancing | +| **Smart Router** | Content-based model selection (OpenCode → router → Olla) | +| **Retriever** | Lightweight Obsidian vault RAG (sqlite-vec + FTS5, hybrid search) | +| **OpenCode** | Primary AI interface — CLI tool + Obsidian sidebar plugin | --- @@ -29,9 +29,6 @@ Built and documented through real-world homelab experience on Intel Arc hardware | Storage | 50 GB free | 100 GB+ free (models are large) | | OS | Ubuntu 22.04 | Ubuntu 24.04 | -> **Note:** This stack uses `ghcr.io/ava-agentone/ollama-intel` which replaced the archived `intelanalytics/ipex-llm-inference-cpp-xpu` image (archived January 28, 2026). - -> **Note:** This stack has been specifically developed and tested on an Asus Zenbook Duo with an Intel Arc iGPU (Meteor Lake) running Ubuntu 24.04LTS. Other Intel scenarios should work, but have not been specifically tested - yet. Please feel free to offer some patches or help us to add support for your system & environment. --- ## Quick start @@ -43,18 +40,20 @@ cd ai-stack # 2. Configure cp .env.example .env -nano .env # set your username, paths, and API keys +nano .env # set your username, paths, and API keys (or skip — install.sh can set up Bitwarden) # 3. Install chmod +x install.sh scripts/check-arc-gpu.sh ./install.sh -# 4. Open -# http://localhost:3000 +# The installer will: +# - Prompt to install OpenCode CLI + Bun +# - Auto-install the OpenCode Obsidian plugin (growlf/opencode-obsidian) +# - Prompt to configure Bitwarden/VaultWarden for secret management +# - Start the stack (ollama-arc, litellm, olla, router, retriever) +# - Prompt to pull models ``` -Then follow **[docs/post-install.md](docs/post-install.md)** for the Open WebUI configuration steps. 
-

---

## Project structure

@@ -67,15 +66,29 @@ ai-stack/
├── systemd/
│   └── ai-stack.service            # Systemd unit (auto-start on boot)
├── scripts/
-│   └── check-arc-gpu.sh            # GPU pre-flight (detects card0/card1 drift)
-├── pipelines/
-│   └── smart_model_router.py       # Auto-routes queries to best model
-├── tools/
-│   └── system_diagnostics.py       # Multi-instance Ollama health + model queries
+│   ├── check-arc-gpu.sh            # GPU pre-flight (detects card0/card1 drift)
+│   ├── discover-herd.sh            # mDNS discovery of remote Ollama nodes
+│   ├── generate-olla-config.sh     # Reads .env → writes proxy/olla.yaml
+│   ├── olla.yaml.template          # Template for proxy/olla.yaml (used by generate-olla-config.sh)
+│   ├── resolve-vaultwarden.sh      # Resolves placeholders via bw CLI
+│   └── generate-keys.sh            # Generates secure keys (LITELLM_MASTER_KEY)
+├── router/
+│   └── smart-model-router.py       # Content-based model routing (OpenCode → router → Olla)
+├── retriever/
+│   ├── main.py                     # FastAPI app
+│   ├── search.py                   # Hybrid search (FTS5 + vector, RRF fusion)
+│   ├── indexer.py                  # Vault scanner + watchdog + chunking
+│   └── Dockerfile
+├── proxy/
+│   └── litellm_config.yaml         # LiteLLM model registry (Claude, Gemini)
+├── .opencode/
+│   └── tools/
+│       └── vault-search.ts         # OpenCode tool: search vault via retriever API
└── docs/
-    ├── post-install.md             # Open WebUI configuration checklist
-    ├── model-guide.md              # Model recommendations and routing table
-    └── troubleshooting.md          # Common issues and fixes
+    ├── deployment-guide.md         # Setup walkthrough
+    ├── model-guide.md              # Model recommendations and routing
+    ├── troubleshooting.md          # Common issues and fixes
+    └── retriever-guide.md          # Obsidian vault RAG setup
```

---

@@ -87,10 +100,10 @@ ai-stack/
| `gemma4:27b` | Heavy lifting, large context, complex analysis |
| `mistral-small3.2:24b` | Strong function calling, 128K context |
| `qwen3.5:14b` | Improved reasoning, tool calling (recommended default) |
-| `qwen2.5:14b` | Tool calling, diagnostics, sysadmin |
-| `qwen2.5-coder:14b` | Scripts, configs, code |
-| `deepseek-r1:14b` | Complex reasoning, root cause analysis |
-| `gemma3:12b` | Log analysis, summaries, documentation |
+| `qwen2.5:14b` | Tool calling, diagnostics, sysadmin (router default for diagnostics) |
+| `qwen2.5-coder:14b` | Scripts, configs, code (router default for code) |
+| `deepseek-r1:14b` | Complex reasoning, root cause analysis (router default for reasoning) |
+| `gemma3:12b` | Log analysis, summaries, documentation (router default for longform) |
| `nomic-embed-text` | Embeddings / RAG |

See **[docs/model-guide.md](docs/model-guide.md)** for details.

@@ -108,13 +121,48 @@ See **[docs/model-guide.md](docs/model-guide.md)** for details.

## Multi-machine setup

-The System Diagnostics tool supports querying multiple Ollama instances across your LAN.
Edit `OLLAMA_INSTANCES` in `tools/system_diagnostics.py`: +Add remote Ollama nodes via `.env`: + +``` +OLLAMA_REMOTE_WORKSTATION=http://192.168.1.50:11434:75 +``` + +Then regenerate Olla config: + +```bash +bash scripts/generate-olla-config.sh +sudo systemctl restart ai-stack.service +``` + +Or auto-discover nodes on your LAN: + +```bash +bash scripts/discover-herd.sh --apply +``` + +--- + +## Secret management (optional) + +The stack can resolve API keys from Bitwarden (or self-hosted VaultWarden) at runtime using `` placeholders in `.env`: + +``` +ANTHROPIC_API_KEY= +``` + +The `install.sh` script prompts to set this up: +- Installs `bw` CLI (via npm) +- Collects your organization ID and API credentials +- Writes `BW_CLIENT_ID`, `BW_CLIENT_SECRET`, `VAULT_MASTER_PASSWORD` to `.env` +- Creates vaultwarden placeholders for `ANTHROPIC_API_KEY`, `GEMINI_API_KEY`, and `LITELLM_MASTER_KEY` +- Resolves them immediately + +On subsequent starts, `start.sh` auto-resolves any unresolved placeholders. -```python -OLLAMA_INSTANCES = { - "local": "http://ollama-arc:11434", # this machine - "remote1": "http://10.0.0.X:11434", # remote machine on your LAN -} +Manual resolution: +```bash +./scripts/resolve-vaultwarden.sh # resolve in-place +./scripts/resolve-vaultwarden.sh --dry-run # preview only ``` --- @@ -133,14 +181,6 @@ sudo systemctl restart ai-stack.service --- -## Related projects - -- [ava-agentone/ollama-intel](https://github.com/Ava-AgentOne/ollama-intel) — Intel Arc optimised Ollama image -- [open-webui/open-webui](https://github.com/open-webui/open-webui) — Web interface -- [open-webui/pipelines](https://github.com/open-webui/pipelines) — Pipeline plugin system - ---- - ## Licence MIT — use freely, contributions welcome. diff --git a/SECURITY.md b/SECURITY.md index 4b9d2ac..7152dfe 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -27,7 +27,9 @@ You can expect an acknowledgement within **72 hours** and a resolution timeline - **Never commit real credentials.** Use `.env` (which is git-ignored) for secrets; `.env.example` must only contain placeholder values. - **Never commit backup files containing secrets.** Files like `.env.backup`, `.env.backup*`, or timestamped backups (e.g., `.env.example.backup-20260503-014323`) must never be committed. Always add backup file patterns to `.gitignore`. -- **Docker socket access.** `open-webui` mounts `/var/run/docker.sock`. Restrict access to this stack to trusted users only. + +- **Use Bitwarden/VaultWarden for secret management (recommended).** The `install.sh` script prompts to set it up. API keys are stored as `` placeholders in `.env` and resolved at runtime via `resolve-vaultwarden.sh`. This avoids storing secrets in plaintext. + - **Network exposure.** By default, services bind to all interfaces. In production, put a reverse proxy (e.g. nginx, Caddy) with TLS in front and restrict direct port access. - **Default passwords.** Change all `changeme` defaults in your `.env` before exposing any service to a network. 
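+
+For reference, runtime resolution boils down to the `bw` CLI. A minimal sketch assuming an already-unlocked vault (`BW_SESSION` exported) and the item names `install.sh` uses; this is not the actual `resolve-vaultwarden.sh` logic:
+
+```python
+# Minimal sketch, NOT scripts/resolve-vaultwarden.sh. Assumes `bw` is
+# logged in and BW_SESSION is exported in the environment.
+import subprocess
+
+ITEMS = {  # vault item names used by install.sh
+    "LITELLM_MASTER_KEY": "litellm-master-key",
+    "ANTHROPIC_API_KEY": "anthropic-api-key",
+    "GEMINI_API_KEY": "gemini-api-key",
+}
+
+def resolve(var: str) -> str:
+    """Fetch one secret from Bitwarden/VaultWarden via the bw CLI."""
+    out = subprocess.run(
+        ["bw", "get", "password", ITEMS[var]],
+        check=True, capture_output=True, text=True,
+    )
+    return out.stdout.strip()
+```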
diff --git a/docker-compose.yml b/docker-compose.yml index 7edc48a..896e738 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,15 @@ ############################################################################### -# AI Stack — with Intel Arc iGPU + cloud models + LiteLLM, unified in Open WebUI +# AI Stack — with Intel Arc iGPU + cloud models + LiteLLM, unified via Olla router # # Topology: # -# Open WebUI ──[Ollama endpoint]──▶ Olla :40114 ──▶ ollama-arc :11434 (Intel Arc iGPU) -# ──[OpenAI endpoint]─┐ ──▶ [remote-node] :11434 (LAN, optional) -# └──────────────────▶ LiteLLM :4000 -# ├─ Claude (Anthropic) -# └─ Gemini (Google) +# Retriever ──[embed]──▶ Olla :40114 ──▶ ollama-arc :11434 (Intel Arc iGPU) +# OpenCode ──[tool / provider]──▶ Olla :40114 +# ──[provider]───────▶ LiteLLM :4000 ──▶ Claude (Anthropic) +# ──▶ Gemini (Google) # -# Khoj ──▶ Olla :40114/olla/ollama (RAG + semantic search over Obsidian vault) -# Pipelines ──▶ Open WebUI (function pipelines) -# Open Terminal ──▶ Open WebUI (browser shell) +# All inference through Olla for unified routing + load balancing. +# OpenCode is the primary AI interface (CLI + Obsidian sidebar plugin). # # Quick start: # 1. cp .env.example .env && nano .env @@ -19,12 +17,9 @@ # 3. docker compose up -d # # Service URLs: -# Open WebUI http://localhost:3000 -# LiteLLM UI http://localhost:4000/ui (admin / LITELLM_MASTER_KEY) # Olla status http://localhost:40114/internal/status/endpoints -# Khoj http://localhost:42110 -# Pipelines http://localhost:9099 -# Open Terminal http://localhost:8000 +# Retriever http://localhost:42000/health +# LiteLLM UI http://localhost:4000/ui (admin / LITELLM_MASTER_KEY) ############################################################################### services: @@ -116,151 +111,57 @@ services: networks: - ai-net - # ─── Open WebUI ──────────────────────────────────────────────────────────── - # Ollama endpoint → Olla (local models, load balanced) - # OpenAI endpoint → LiteLLM (Claude, Gemini — all appear in model picker) - open-webui: - image: ghcr.io/open-webui/open-webui:main - container_name: open-webui - restart: always - ports: - - "${WEBUI_PORT:-3000}:8080" - environment: - # Local models via Olla (replaces direct ollama-arc reference) - - OLLAMA_BASE_URL=http://olla:40114/olla/ollama - # Cloud models via LiteLLM - - OPENAI_API_BASE_URL=http://litellm:4000/v1 - - OPENAI_API_KEY=${LITELLM_MASTER_KEY:-sk-local-admin} - # Open Terminal integration - - OPEN_TERMINAL_URL=http://open-terminal:8000 - - OPEN_TERMINAL_API_KEY=${OPEN_TERMINAL_API_KEY:-changeme} - # Pipelines integration - - PIPELINES_URLS=http://pipelines:9099 - - PIPELINES_API_KEY=${PIPELINES_API_KEY:-changeme} - # UI settings - - WEBUI_NAME=${WEBUI_NAME:-Yeti's AI Stack} - - WEBUI_SECRET_KEY=${WEBUI_SECRET_KEY:?Set WEBUI_SECRET_KEY to a strong random value} - - DEFAULT_USER_ROLE=user - - ENABLE_SIGNUP=${ENABLE_SIGNUP:-false} - volumes: - - open-webui:/app/backend/data - - /var/run/docker.sock:/var/run/docker.sock - depends_on: - - olla - - litellm - - pipelines - networks: - - ai-net - - # ─── Open Terminal ───────────────────────────────────────────────────────── - open-terminal: - image: ghcr.io/open-webui/open-terminal:latest - container_name: open-terminal + # ─── Retriever — Obsidian vault RAG ──────────────────────────────────────── + # Lightweight replacement for Khoj + PostgreSQL. + # Uses sqlite-vec for vector storage, FTS5 for keyword search, + # and watchdog for live vault indexing. 
+ # Hybrid search: BM25 (FTS5) + vector similarity, fused via RRF. + # Embeddings via Olla → ollama-arc (nomic-embed-text). + # API-only: no web UI. Intended for use as an OpenCode tool. + retriever: + build: ./retriever + container_name: retriever restart: unless-stopped ports: - - "${TERMINAL_PORT:-8000}:8000" - environment: - - OPEN_TERMINAL_API_KEY=${OPEN_TERMINAL_API_KEY:-changeme} - - TERMINAL_SUDO_ENABLED=true - - TERMINAL_USER=${STACK_USER:-user} + - "${RETRIEVER_PORT:-42000}:42000" volumes: - - open-terminal-data:/home/user - networks: - - ai-net - - # ─── Pipelines ───────────────────────────────────────────────────────────── - pipelines: - image: ghcr.io/open-webui/pipelines:main - container_name: pipelines - restart: always - ports: - - "${PIPELINES_PORT:-9099}:9099" + - retriever-data:/data + - ${RETRIEVER_VAULT_PATH:-/home/user/obsidian}:/vault:ro environment: - - PIPELINES_API_KEY=${PIPELINES_API_KEY:-changeme} - volumes: - - pipelines:/app/pipelines - networks: - - ai-net - - # ─── Khoj — AI second brain + Obsidian RAG ───────────────────────────────── - # Indexes your Obsidian vault for semantic search + RAG over local notes. - # Routes inference through Olla so it benefits from load balancing/failover. - # Obsidian plugin setup: see docs/khoj-setup.md - khoj-db: - image: ankane/pgvector:latest - container_name: khoj-db - restart: unless-stopped - environment: - POSTGRES_USER: khoj - POSTGRES_PASSWORD: ${KHOJ_DB_PASSWORD:?Set KHOJ_DB_PASSWORD in your .env file} - POSTGRES_DB: khoj - volumes: - - khoj-db:/var/lib/postgresql/data + - OLLA_URL=http://olla:40114 + - EMBED_MODEL=${RETRIEVER_EMBED_MODEL:-nomic-embed-text} + - CHUNK_SIZE=${RETRIEVER_CHUNK_SIZE:-512} + - CHUNK_OVERLAP=${RETRIEVER_CHUNK_OVERLAP:-64} + - DB_PATH=/data/retriever.db + - VAULT_PATH=/vault + depends_on: + - olla networks: - ai-net - khoj: - image: ghcr.io/khoj-ai/khoj:latest - container_name: khoj + # ── Smart Model Router — content-based model selection ────────────────── + # Sits between OpenCode and Olla. Classifies queries and auto-selects the + # best local model. Cloud model requests pass through unchanged. + # Point OpenCode's Olla provider at http://localhost:40115. + router: + build: ./router + container_name: router restart: unless-stopped ports: - - "127.0.0.1:${KHOJ_PORT:-42110}:42110" + - "40115:40115" environment: - - KHOJ_ADMIN_EMAIL=${KHOJ_ADMIN_EMAIL:-admin@localhost} - - KHOJ_ADMIN_PASSWORD=${KHOJ_ADMIN_PASSWORD:-changeme} - - KHOJ_DJANGO_SECRET_KEY=${KHOJ_DJANGO_SECRET_KEY:-changeme-secret-key} - - POSTGRES_USER=khoj - - POSTGRES_PASSWORD=${KHOJ_DB_PASSWORD:?Set KHOJ_DB_PASSWORD in your .env file} - - POSTGRES_DB=khoj - - POSTGRES_HOST=khoj-db - - POSTGRES_PORT=5432 - # Route inference through Olla instead of directly to ollama-arc. - # Khoj now benefits from load balancing + failover automatically. - - OPENAI_BASE_URL=http://olla:40114/olla/ollama/v1/ - # nomic-embed-text is already in the recommended model stack - - EMBEDDING_MODEL=nomic-embed-text - # Set KHOJ_NO_AUTH=true in .env only if you want unauthenticated local access. - # Default is false (auth required). Do not enable in production without a reverse proxy. 
- - KHOJ_NO_AUTH=${KHOJ_NO_AUTH:-false} - command: --host="0.0.0.0" --port=42110 --non-interactive - volumes: - # Mount your Obsidian vault read-only for indexing - - ${OBSIDIAN_VAULT_PATH:-/home/user/obsidian}:/vault:ro + - OLLA_URL=http://olla:40114 + - LISTEN_HOST=0.0.0.0 + - LISTEN_PORT=40115 depends_on: - - khoj-db - olla networks: - ai-net - # ─── Khoj Sync (CouchDB → Khoj live indexing) ───────────────────────────── - # Watches CouchDB _changes feed and pushes updated notes to Khoj in real-time. - # Handles LiveSync's chunked storage format automatically. - # khoj-sync: - # build: - # context: ./khoj-sync - # container_name: khoj-sync - # restart: unless-stopped - # environment: - # - COUCHDB_URL=${COUCHDB_URL} - # - COUCHDB_DB=${COUCHDB_DB:-yeti-journal} - # - COUCHDB_USER=${COUCHDB_USER} - # - COUCHDB_PASSWORD=${COUCHDB_PASSWORD} - # - KHOJ_URL=http://khoj:42110 - # - LOG_LEVEL=${KHOJ_SYNC_LOG_LEVEL:-INFO} - # - KHOJ_SYNC_SKIP_INITIAL=${KHOJ_SYNC_SKIP_INITIAL:-false} - # networks: - # - ai-net - # depends_on: - # - khoj - -# ─── Volumes ────────────────────────────────────────────────────────────────── +# ── Volumes ────────────────────────────────────────────────────────────────── volumes: - open-webui: - external: true # pre-existing — preserves your WebUI config/history - open-terminal-data: - pipelines: - khoj-db: olla-logs: + retriever-data: # ─── Networks ───────────────────────────────────────────────────────────────── networks: diff --git a/docs/deployment-guide.md b/docs/deployment-guide.md index 07ce5ca..5edfb09 100644 --- a/docs/deployment-guide.md +++ b/docs/deployment-guide.md @@ -19,7 +19,7 @@ A step-by-step guide for setting up the AI Stack on Linux with Intel Arc iGPU. ### 1. Clone the repository ```bash -git clone https://github.com/yourusername/ai-stack.git +git clone https://github.com/growlf/ai-stack.git cd ai-stack ``` @@ -34,15 +34,10 @@ Edit `.env` and set at minimum: | Variable | What to set | |----------|-------------| | `STACK_USER` | Your Linux username | -| `LITELLM_MASTER_KEY` | Change from default — use `sk-local-` + random hex | -| `ANTHROPIC_API_KEY` | Your Anthropic API key (for Claude) | -| `GEMINI_API_KEY` | Your Google AI API key (for Gemini) | -| `WEBUI_SECRET_KEY` | Generate with: `python3 -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"` | - -```bash -# Generate keys for all services -bash scripts/generate-keys.sh -``` +| `LITELLM_MASTER_KEY` | Auto-generated by installer — stored in Bitwarden if configured | +| `ANTHROPIC_API_KEY` | Your Anthropic API key (for Claude) — store in Bitwarden via `install.sh` | +| `GEMINI_API_KEY` | Your Google AI API key (for Gemini) — store in Bitwarden via `install.sh` | +| `RETRIEVER_VAULT_PATH` | Path to your Obsidian vault directory | ### 3. Run the installer @@ -53,63 +48,58 @@ bash install.sh The installer automates: - Creating required Docker volumes - Installing the systemd service (`ai-stack.service`) -- Deploying pipeline files to the Pipelines container +- Installing OpenCode CLI + Bun (optional prompt) +- Auto-installing the OpenCode Obsidian plugin (`growlf/opencode-obsidian`) into `.obsidian/` +- Prompting to configure Bitwarden/VaultWarden for secret management - Starting the full stack - Prompting you to pull models -### 4. Open WebUI +### 4. Configure OpenCode -Open **http://localhost:3000** and create your admin account (first user becomes admin). +OpenCode is the primary AI interface. 
The installer creates a global config at `~/.opencode/config.json` with all providers: -> **Important:** The first account registered is the admin. Register immediately on first visit. +- **Olla** → Smart Router (`:40115`) — auto-routes local model requests +- **LiteLLM** (`:4000`) — direct access to Claude, Gemini -### 5. Post-install configuration +The Olla provider now routes through the Smart Model Router, which classifies your query and selects the best model for the task. -After first login, follow the [post-install guide](post-install.md) to configure: -- Ollama connection (`http://ollama-arc:11434`) -- Pipelines connection (`http://pipelines:9099`) -- Open Terminal integration -- System Diagnostics tool -- Smart Model Router +### 5. Verify the retriever + +```bash +curl localhost:42000/health +curl -X POST localhost:42000/search -H 'Content-Type: application/json' \ + -d '{"query":"what did I write about networking?"}' +``` --- ## Architecture Overview ``` -Open WebUI :3000 - ├── Ollama API → Olla :40114/olla/ollama → ollama-arc :11434 (Intel Arc iGPU) - │ → remote Ollama nodes (LAN, optional) - └── OpenAI API → LiteLLM :4000/v1 → Claude (Anthropic) - → Gemini (Google) - -Khoj :42110 → Olla :40114/olla/ollama/v1/ → ollama-arc (RAG over Obsidian vault) +OpenCode (CLI + Obsidian plugin) + ├── tool: retriever :42000 → sqlite-vec + FTS5 hybrid search over vault + ├── provider: Olla :40114 → ollama-arc :11434 (Intel Arc iGPU) + │ → OLLAMA_REMOTE_* nodes (LAN, optional) + └── provider: LiteLLM :4000 → Claude (Anthropic), Gemini (Google) ``` -All traffic flows through **Olla** (port 40114) as the unified LLM router. This means you only configure one endpoint in your tools. - ### Service Quick Reference | Service | Port | Purpose | |---------|------|---------| -| Open WebUI | 3000 | Chat UI, admin panel | +| Smart Router | 40115 | Content-based model routing (auto-selects best local model) | | Olla | 40114 | LLM router / load balancer | | LiteLLM | 4000 | Cloud model proxy (Claude, Gemini) | | Ollama (arc) | 11434 | Local LLM runner (Intel Arc iGPU) | -| Pipelines | 9099 | Query routing, code execution pipeline | -| Open Terminal | 8000 | Terminal in the browser | -| Khoj | 42110 | AI search over your notes | -| Khoj DB | 5432 | Postgres for Khoj | +| Retriever | 42000 | Obsidian vault RAG (API-only) | --- ## Verifying the Stack -Once the stack is running, verify everything is healthy: - ```bash # Olla (LLM router) -curl http://localhost:40114/health +curl http://localhost:40114/internal/health # Ollama (local models) curl http://localhost:11434/api/tags @@ -117,18 +107,11 @@ curl http://localhost:11434/api/tags # LiteLLM (cloud gateway) curl http://localhost:4000/health/liveness -# Open WebUI -curl http://localhost:3000/health -``` - -Check models are available: +# Retriever +curl http://localhost:42000/health -```bash -# List installed models +# Check models docker exec ollama-arc ollama list - -# Check what's loaded in GPU memory -curl http://localhost:11434/api/ps | python3 -m json.tool ``` Verify the GPU is working: @@ -137,8 +120,6 @@ Verify the GPU is working: docker logs ollama-arc 2>&1 | grep -i "device\|gpu\|arc\|oneapi" ``` -Expected output shows `oneapi` as the inference engine and VRAM > 0. 
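+To sweep every health endpoint above in one shot, a convenience sketch (not a shipped script; the ports are the `.env.example` defaults):
+
+```python
+# One-shot health sweep over the endpoints listed above. Convenience
+# sketch only, not part of the repo. Ports follow .env.example defaults.
+import urllib.request
+
+CHECKS = {
+    "olla":       "http://localhost:40114/internal/health",
+    "ollama-arc": "http://localhost:11434/api/tags",
+    "litellm":    "http://localhost:4000/health/liveness",
+    "retriever":  "http://localhost:42000/health",
+}
+
+for name, url in CHECKS.items():
+    try:
+        status = urllib.request.urlopen(url, timeout=5).status
+        print(f"{name:12s} OK ({status})")
+    except Exception as exc:  # connection refused, timeout, HTTP error
+        print(f"{name:12s} FAIL ({exc})")
+```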
- --- ## Day 2 Operations @@ -152,19 +133,6 @@ sudo systemctl restart ai-stack.service sudo systemctl status ai-stack.service ``` -### Direct Docker Compose (for testing) - -```bash -# Start with pre-flight checks -bash start.sh -d - -# Or start directly (no pre-flight) -docker compose up -d - -# Stop -docker compose down -``` - ### View logs ```bash @@ -172,49 +140,34 @@ docker compose down docker compose logs --tail=50 -f # Single service -docker logs open-webui --tail=30 -f docker logs ollama-arc --tail=30 -f +docker logs retriever --tail=30 -f ``` ### Pull new models ```bash -docker exec ollama-arc ollama pull deepseek-r1:14b -docker exec ollama-arc ollama pull gemma4:27b -docker exec ollama-arc ollama pull mistral-small3.2:24b -docker exec ollama-arc ollama pull qwen3.5:14b -docker exec ollama-arc ollama pull qwen2.5-coder:14b -docker exec ollama-arc ollama pull gemma3:12b -docker exec ollama-arc ollama pull qwen2.5:14b docker exec ollama-arc ollama pull nomic-embed-text:latest +docker exec ollama-arc ollama pull qwen3.5:14b ``` ### Add a remote Ollama node -1. Add to `.env`: `OLLAMA_REMOTE_MYNODE=http://192.168.1.50:11434` -2. Regenerate Olla config: `bash scripts/generate-olla-config.sh` -3. Restart: `sudo systemctl restart ai-stack.service` - -### Update the stack - ```bash -git pull -docker compose pull +# Manual +echo "OLLAMA_REMOTE_MYNODE=http://192.168.1.50:11434" >> .env +bash scripts/generate-olla-config.sh sudo systemctl restart ai-stack.service -``` -### Edit configuration and reload +# Or auto-discover +bash scripts/discover-herd.sh --apply +``` -If you change `.env` and need to apply: +### Update the stack ```bash -# Regenerate Olla config (reads OLLAMA_REMOTE_* from .env) -bash scripts/generate-olla-config.sh - -# Regenerate pipeline configs -bash install.sh # (idempotent — safe to re-run) - -# Restart the stack +git pull +docker compose pull sudo systemctl restart ai-stack.service ``` @@ -223,8 +176,8 @@ sudo systemctl restart ai-stack.service ## Security Basics - **Never commit `.env`** — it's gitignored, but double-check `git status` before committing -- **Change all default passwords** — `LITELLM_MASTER_KEY`, `WEBUI_SECRET_KEY`, `PIPELINES_API_KEY`, `KHOJ_ADMIN_PASSWORD` must not be defaults -- **Backup files** — If you create `.env.backup`, it's gitignored, but verify with `git status` +- **Use Bitwarden/VaultWarden for secrets** — `install.sh` prompts to set up; resolves API keys from vault at runtime +- **Change all default passwords** — if not using Bitwarden, change `LITELLM_MASTER_KEY` from default - **Network exposure** — All services bind to all interfaces by default. 
Put a reverse proxy with TLS in front for production --- @@ -233,7 +186,6 @@ sudo systemctl restart ai-stack.service | Guide | What it covers | |-------|----------------| -| [post-install.md](post-install.md) | Open WebUI admin panel setup (connections, tools, pipelines) | -| [model-guide.md](model-guide.md) | Model recommendations for Intel Arc iGPU, Smart Router routing | -| [khoj-setup.md](khoj-setup.md) | Khoj / Obsidian vault integration | +| [model-guide.md](model-guide.md) | Model recommendations for Intel Arc iGPU | +| [retriever-guide.md](retriever-guide.md) | Configuring OpenCode to search your Obsidian vault | | [troubleshooting.md](troubleshooting.md) | Common issues and how to fix them | diff --git a/docs/khoj-setup.md b/docs/khoj-setup.md deleted file mode 100644 index 79eebe3..0000000 --- a/docs/khoj-setup.md +++ /dev/null @@ -1,122 +0,0 @@ -# Khoj setup - -Khoj is your AI second brain — it indexes your Obsidian vault and lets you ask questions -about your notes using your local Ollama models. No data leaves your machine. - -Khoj runs at `http://localhost:42110` (or your configured `KHOJ_PORT`). - ---- - -## Prerequisites - -Before starting, make sure `nomic-embed-text` is pulled — Khoj uses it for embeddings: - -```bash -docker exec ollama-arc ollama pull nomic-embed-text:latest -``` - ---- - -## First login - -Navigate to `http://localhost:42110` and log in with the credentials from your `.env`: - -- **Email:** `KHOJ_ADMIN_EMAIL` -- **Password:** `KHOJ_ADMIN_PASSWORD` - ---- - -## Get your API key - -1. Go to `http://localhost:42110/settings` -2. Under **API Keys**, create a new key -3. Copy it — you'll need it for the Obsidian plugin - ---- - -## Obsidian plugin setup - -1. Open Obsidian → **Settings → Community Plugins → Browse** -2. Search for **Khoj** and install it -3. Enable the plugin -4. Open the Khoj plugin settings and configure: - -| Setting | Value | -|---------|-------| -| Server URL | `http://localhost:42110` | -| API Key | your key from the step above | - -5. Click **Force Sync** to index your vault immediately - -By default Khoj will auto-sync your vault periodically. Force Sync triggers it on demand. - ---- - -## Using Khoj - -**From Obsidian:** -- Click the Khoj chat icon 💬 in the ribbon -- Or run `Khoj: Chat` from the Command Palette -- Ask questions like "what did I write about DNS last week?" or "summarize my notes on Cascade STEAM" - -**Find similar notes:** -- Run `Khoj: Find Similar Notes` from the Command Palette to see notes related to the one you're currently viewing - -**From the browser:** -- Go to `http://localhost:42110` for the full Khoj web interface -- Create custom agents with specific knowledge bases from subsets of your vault - ---- - -## Choosing a chat model - -By default Khoj uses Ollama via `OPENAI_BASE_URL=http://ollama-arc:11434/v1/`. To set which -model Khoj uses for chat: - -1. Go to `http://localhost:42110/settings` -2. Under **Chat Models**, add or select a model -3. Use any model name from your Ollama instance (e.g. `gemma3:12b`, `qwen2.5:14b`) - -`gemma3:12b` is recommended for Khoj — its long context window handles large vault searches well. - ---- - -## Vault path - -Your Obsidian vault is mounted read-only into the Khoj container at `/vault`. The path on -your host is set by `OBSIDIAN_VAULT_PATH` in `.env`. 
- -If you change the vault path, update `.env` and restart the stack: - -```bash -sudo systemctl restart ai-stack.service -``` - ---- - -## Troubleshooting - -**Khoj won't start:** -```bash -docker logs khoj --tail 30 -docker logs khoj-db --tail 10 -``` -Most startup failures are database connection issues — ensure `khoj-db` is healthy first. - -**Vault not indexing:** -- Confirm `OBSIDIAN_VAULT_PATH` in `.env` points to a real directory -- Check the vault is mounted: `docker exec khoj ls /vault` -- Trigger a manual sync from the Obsidian plugin settings - -**nomic-embed-text errors:** -```bash -docker exec ollama-arc ollama list | grep nomic -# If missing: -docker exec ollama-arc ollama pull nomic-embed-text:latest -``` - -**Khoj can't reach Ollama:** -Both containers are on `ai-net` — verify: -```bash -docker exec khoj curl -s http://ollama-arc:11434/api/tags | python3 -m json.tool | head -5 -``` diff --git a/docs/model-guide.md b/docs/model-guide.md index e582a8f..0d9bec8 100644 --- a/docs/model-guide.md +++ b/docs/model-guide.md @@ -27,7 +27,7 @@ Recommendations for Intel Arc iGPU with your available RAM. **qwen3.5:14b** — The latest Qwen generation with improved reasoning and tool calling. Recommended as the default model if your RAM allows alongside the rest of the stack. -**qwen2.5:14b** — Still solid at tool calling in Open WebUI. Falls back to this if qwen3.5 isn't available or you need the smaller footprint. +**qwen2.5:14b** — Still solid at tool calling. Falls back to this if qwen3.5 isn't available or you need the smaller footprint. **qwen2.5-coder:14b** — Optimised for code and config work. Understands YAML, Dockerfiles, systemd units, bash. Better than the base model for anything involving file structure or shell commands. @@ -35,7 +35,7 @@ Recommendations for Intel Arc iGPU with your available RAM. **gemma3:12b** — Long context window, good at summarising large log files or documents. Less reliable at tool calling than qwen2.5. -**nomic-embed-text** — Lightweight embedding model needed if you use Open WebUI's knowledge base / RAG features with your documents. +**nomic-embed-text** — Lightweight embedding model used by the retriever service for vault RAG. --- @@ -92,14 +92,6 @@ Common candidates for removal if not in your workflow: --- -## Smart Model Router routing table +## Model routing -The `smart_model_router` pipeline automatically routes queries to the best model: - -| Trigger keywords | Model selected | -|-----------------|----------------| -| health, status, check, monitor, alert, ollama, docker, gpu, vram | `qwen2.5:14b` | -| script, bash, yaml, compose, dockerfile, code, error, install | `qwen2.5-coder:14b` | -| why, root cause, analyze, optimize, performance, architecture | `deepseek-r1:14b` | -| logs, summarize, document, report, explain this | `gemma3:12b` | -| (anything else) | `qwen2.5:14b` (default) | +OpenCode handles model selection per conversation — choose the right model for each task. For automatic routing, configure multiple providers in your OpenCode config or use the Olla load balancer for priority-based routing. diff --git a/docs/post-install.md b/docs/post-install.md deleted file mode 100644 index df563e7..0000000 --- a/docs/post-install.md +++ /dev/null @@ -1,144 +0,0 @@ -# Post-install configuration guide - -The installer handles everything that can be automated. These steps must be completed manually in the Open WebUI admin panel. - -Open WebUI: http://localhost:3000 (or your configured WEBUI_PORT) - ---- - -## 1. 
First login - -On first visit, create your admin account. The first account registered becomes the admin. - ---- - -## 2. Connections - -Go to **Admin Panel → Settings → Connections** - -**Ollama API** -- URL: `http://ollama-arc:11434` -- Toggle: enabled - -**Pipelines** (add via the + button next to OpenAI API) -- URL: `http://pipelines:9099` -- API Key: value from your `.env` PIPELINES_API_KEY - ---- - -## 3. Open Terminal - -Go to **Admin Panel → Settings → Integrations** - -- Enable the Open Terminal toggle -- URL: `http://open-terminal:8000` -- Auth: Bearer → value from your `.env` OPEN_TERMINAL_API_KEY - ---- - -## 4. Install System Diagnostics tool - -Go to **Admin Panel → Tools → + (Create Tool)** - -Paste the full contents of `tools/system_diagnostics.py` from this repo into the editor and save. - -Edit `OLLAMA_INSTANCES` near the top of the tool to match your setup: -```python -OLLAMA_INSTANCES = { - "local": "http://ollama-arc:11434", # always present - # "remote1": "http://10.0.0.X:11434", # add remote machines here -} -``` - -> **Why manual?** Tools are stored in Open WebUI's database, not on the filesystem. -> The installer can only deploy pipeline files — tools must be pasted through the UI. - ---- - -## 5. Enable tools on models - -Go to **Admin Panel → Models** → edit each model → under **Tools** check: -- System Diagnostics -- File Scout (if installed) - -Do this for: `deepseek-r1:14b`, `gemma4:27b`, `mistral-small3.2:24b`, `qwen3.5:14b`, `qwen2.5-coder:14b`, `gemma3:12b`, `qwen2.5:14b` - ---- - -## 5. Verify pipelines - -Go to **Admin Panel → Settings → Pipelines** - -You should see `http://pipelines:9099` listed and the following modules loaded: -- `events_pipeline` -- `python_code_pipeline` -- `smart_model_router` - -If `smart_model_router` is missing, run: -```bash -docker exec pipelines rm -rf /app/pipelines/__pycache__ -docker cp pipelines/smart_model_router.py pipelines:/app/pipelines/smart_model_router.py -docker restart pipelines -``` - ---- - -## 6. Create a sysadmin model (optional but recommended) - -Go to **Admin Panel → Models → Manage → Create a model** - -- Name: `Sysadmin` -- JSON: `{"model": "my-sysadmin", "from": "qwen2.5:14b"}` - -Then edit the created model and set: - -**System Prompt:** -``` -You are a sysadmin assistant for this machine (Intel Arc iGPU, Linux/Docker homelab). -You have access to System Diagnostics and File Scout tools. -Always use these tools when asked about system status, health checks, loaded models, or infrastructure. -Never write code to simulate tool calls — always call the actual tool. -Be concise and technical. -``` - -**Tools:** enable System Diagnostics and File Scout - ---- - -## 7. Add remote Ollama instances (optional) - -If you have other machines running Ollama on your LAN, edit the `OLLAMA_INSTANCES` dict in `tools/system_diagnostics.py` and reinstall: - -```python -OLLAMA_INSTANCES = { - "local": "http://ollama-arc:11434", - "remote1": "http://10.0.0.X:11434", # your remote machine IP -} -``` - -Then update the tool in Open WebUI via **Admin Panel → Tools → System Diagnostics**. - ---- - -## 8. Smart Model Router - -The router runs as a filter pipeline. In v0.9.x it appears in the Pipelines valve settings at **Admin Panel → Settings → Pipelines**. 
-
-To enable debug mode (shows which model was selected):
-- Expand `smart_model_router` in the Pipelines valves
-- Set `debug` to `true`
-
-To use the router in a chat, select any model — the router intercepts the request and overrides the model based on query content:
-
-| Query type | Routed to |
-|------------|-----------|
-| Health checks, diagnostics, tool calls | `qwen2.5:14b` |
-| Scripts, configs, code | `qwen2.5-coder:14b` |
-| Root cause analysis, complex reasoning | `deepseek-r1:14b` |
-| Log analysis, summaries, documentation | `gemma3:12b` |
-
----
-
-## Troubleshooting
-
-See `docs/troubleshooting.md` for common issues and solutions.
diff --git a/docs/retriever-guide.md b/docs/retriever-guide.md
new file mode 100644
index 0000000..9ccb708
--- /dev/null
+++ b/docs/retriever-guide.md
@@ -0,0 +1,141 @@
+# Retriever — Obsidian Vault RAG
+
+The retriever replaces Khoj + PostgreSQL as a lightweight, API-only RAG service for your Obsidian vault. It uses sqlite-vec (file-based, no separate DB) and hybrid search (keyword + vector) for fast, accurate retrieval.
+
+---
+
+## How it works
+
+```
+Obsidian vault (on host, :ro)
+      │
+      ▼
+retriever container :42000
+  ├── watchdog scans for file changes (inotify)
+  ├── embeds chunks via Olla → ollama-arc (nomic-embed-text)
+  ├── stores vectors in sqlite-vec (embedded in SQLite)
+  └── indexes keywords in FTS5 (BM25 scoring)
+
+Search:
+  POST /search {"query": "what did I write about DNS?"}
+    → hybrid search (vector + keyword, RRF fusion)
+    → top 10 chunks with scores
+```
+
+Configuration via `.env`:
+
+```
+RETRIEVER_PORT=42000
+RETRIEVER_VAULT_PATH=/home/you/obsidian
+RETRIEVER_EMBED_MODEL=nomic-embed-text
+RETRIEVER_CHUNK_SIZE=512
+RETRIEVER_CHUNK_OVERLAP=64
+```
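+
+The "RRF fusion" step above refers to reciprocal-rank fusion. The service's exact constants and weights aren't shown here, so treat this as an illustrative sketch of the idea rather than the retriever's actual code:
+
+```python
+def rrf_fuse(keyword_ranked: list[str], vector_ranked: list[str], k: int = 60) -> list[str]:
+    """Merge two ranked lists of chunk IDs via reciprocal-rank fusion."""
+    scores: dict[str, float] = {}
+    for ranking in (keyword_ranked, vector_ranked):
+        for rank, chunk_id in enumerate(ranking, start=1):
+            # Each list contributes 1/(k + rank); k damps the influence of top ranks.
+            scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
+    return sorted(scores, key=scores.get, reverse=True)
+```
+
+The practical effect: a chunk ranked highly by either the FTS5 keyword index or the vector index still surfaces, even if the other ranking misses it entirely.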
+
+---
+
+## API
+
+### `GET /health`
+
+```bash
+curl localhost:42000/health
+```
+
+Returns:
+```json
+{
+  "status": "ok",
+  "indexed_files": 1240,
+  "total_chunks": 5420,
+  "vault_watching": true,
+  "vault_path": "/vault",
+  "is_indexing": false
+}
+```
+
+### `POST /search`
+
+```bash
+curl -X POST localhost:42000/search \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "what did I write about networking?", "top_k": 10}'
+```
+
+Returns:
+```json
+{
+  "results": [
+    {
+      "filepath": "networking/dns-notes.md",
+      "chunk_index": 2,
+      "content": "...",
+      "parent_heading": "DNS Configuration",
+      "score": 0.921
+    }
+  ]
+}
+```
+
+### `POST /reindex`
+
+Force a full reindex:
+
+```bash
+curl -X POST localhost:42000/reindex
+```
+
+Returns immediately — reindexing runs in the background.
+
+---
+
+## Using with OpenCode
+
+OpenCode calls the retriever as a native tool via the project-level `.opencode/tools/vault-search.ts`. This tool is automatically available when you run `opencode` from the project directory.
+
+Two tools are provided:
+
+- **`vault-search`** — search the entire vault for notes matching a query
+- **`vault-search_per_source`** — search within a specific file or subdirectory
+
+Both tools call `POST /search` on the retriever API and return file paths, content snippets, and relevance scores.
+
+The tools are pre-configured in `.opencode/config.json` and auto-approved by default. No manual setup is needed.
+
+---
+
+## Performance tuning
+
+| Setting | Effect | Default |
+|---------|--------|---------|
+| `CHUNK_SIZE` | Max characters per chunk. Smaller = more precise, larger = more context. | 512 |
+| `CHUNK_OVERLAP` | Overlap between chunks. Helps with boundary crossing. | 64 |
+| `EMBED_MODEL` | Embedding model used by ollama-arc. `nomic-embed-text` is fast and small. | nomic-embed-text |
+
+For a very large vault:
+- **Chunk size**: 512-768 works well for most notes. Increase to 1024 if notes are long-form.
+- **Embedding model**: `nomic-embed-text` (768-dim, 274MB) is fast on iGPU. For better accuracy, try `mxbai-embed-large` (1024-dim, 334MB).
+- **Re-indexing**: Trigger `POST /reindex` after bulk imports. The watchdog handles incremental changes in real time.
+
+---
+
+## Troubleshooting
+
+**Retriever won't start:**
+```bash
+docker logs retriever --tail 30
+```
+
+**Vault not indexing:**
+- Verify `RETRIEVER_VAULT_PATH` in `.env` points to a real directory
+- Check the mount: `docker exec retriever ls /vault`
+- Check Olla is healthy: `curl localhost:40114/internal/health`
+
+**nomic-embed-text errors:**
+```bash
+docker exec ollama-arc ollama pull nomic-embed-text:latest
+```
+
+**Search returns no results:**
+- Check `curl localhost:42000/health` — if `indexed_files` is 0, the vault isn't populated or the path is wrong
+- If indexing is running, wait for it to complete
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 288f174..66d239b 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -23,10 +23,8 @@ docker logs ollama-arc 2>&1 | grep -i "device\|gpu\|arc\|oneapi"

Manual fix:
```bash
ls -la /dev/dri/
-# Find which card is Intel (vendor 0x8086)
cat /sys/class/drm/card0/device/vendor
cat /sys/class/drm/card1/device/vendor
-# Update .env with the correct card
```

2. **Container started without GPU access** — If the card node drifted and the service started before `check-arc-gpu.sh` updated `.env`:

@@ -38,105 +36,95 @@ docker logs ollama-arc 2>&1 | grep -i "device\|gpu\|arc\|oneapi"

---

-## Ollama not reachable from Open WebUI
+## Retriever

-**Symptom:** "Trouble accessing Ollama" warning in Open WebUI connections.
+### Retriever shows 0 indexed files

**Check:**
```bash
-docker exec -it open-webui curl http://ollama-arc:11434/api/tags
+curl localhost:42000/health
+docker logs retriever --tail 20
```

-**Fix:** Verify both containers are on the `ai-net` network:
-```bash
-docker network inspect ai-stack_ai-net | grep -A3 "Name"
-```
+**Common causes:**

----

+1. **Vault path missing** — Verify `RETRIEVER_VAULT_PATH` in `.env` points to a real directory that contains `.md` files.
+
+2. **Embeddings failing** — The retriever needs Olla healthy and `nomic-embed-text` pulled:
+   ```bash
+   curl localhost:40114/internal/health
+   docker exec ollama-arc ollama list | grep nomic
+   docker exec ollama-arc ollama pull nomic-embed-text:latest
+   ```

-## Remote Ollama instance unreachable from tools
+3. **Vault not mounted** — Check the container:
+   ```bash
+   docker exec retriever ls /vault
+   ```

-**Symptom:** System Diagnostics reports remote instance as unreachable, but you can ping/curl it from the host.
+---

-**Root cause:** Tools run inside the Open WebUI container's Python process. The container needs a route to your LAN.
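+
+If the numbered checks above all pass but searches still come back empty, a small end-to-end smoke test of the two documented endpoints can isolate the failing side (a sketch; it assumes the default `RETRIEVER_PORT=42000`):
+
+```python
+import json
+import urllib.request
+
+BASE = "http://localhost:42000"
+
+# /health reports indexed_files, total_chunks, and whether indexing is running.
+with urllib.request.urlopen(f"{BASE}/health", timeout=5) as r:
+    print(json.load(r))
+
+# /search should return ranked chunks once indexed_files > 0.
+req = urllib.request.Request(
+    f"{BASE}/search",
+    data=json.dumps({"query": "test", "top_k": 3}).encode(),
+    headers={"Content-Type": "application/json"},
+)
+with urllib.request.urlopen(req, timeout=10) as r:
+    print(json.load(r))
+```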
+
+## Olla
+
+### Olla not routing correctly

**Check:**
```bash
-docker exec -it open-webui python3 -c "
-import httpx, asyncio
-async def test():
-    async with httpx.AsyncClient(timeout=5) as c:
-        r = await c.get('http://YOUR_REMOTE_IP:11434/api/tags')
-        print('OK', r.status_code)
-asyncio.run(test())
-"
+curl localhost:40114/internal/status/endpoints
```

-**Fix:** If the above fails, add `extra_hosts` to open-webui in `docker-compose.yml`:
-```yaml
-open-webui:
-  extra_hosts:
-    - "host.docker.internal:host-gateway"
+**Fix:** Regenerate config after changing `.env`:
+```bash
+bash scripts/generate-olla-config.sh
+sudo systemctl restart ai-stack.service
```

---

-## Pipelines crash loop
+## LiteLLM

-**Symptom:** `docker logs pipelines` shows repeated startup failures.
+### LiteLLM won't start / cloud models unavailable

-**Most common cause:** A pipeline file is missing required configuration (e.g. a GitHub token).
-
-**Fix:**
+**Check logs:**
```bash
-# Find the offending pipeline
-docker logs pipelines 2>&1 | grep "ERROR\|ValueError"
-
-# Remove it
-docker exec pipelines rm -rf /app/pipelines/PROBLEM_PIPELINE.py
-docker exec pipelines rm -rf /app/pipelines/PROBLEM_PIPELINE/
-docker exec pipelines rm -rf /app/pipelines/__pycache__
-docker restart pipelines
+docker logs litellm --tail 30
```

----
-
-## Pipeline changes not taking effect
-
-**Symptom:** Updated a `.py` file but old behaviour persists.
+**Common causes:**

-**Cause:** Python bytecode cache (`__pycache__`) is stale.
+1. **Missing API keys** — `ANTHROPIC_API_KEY` and `GEMINI_API_KEY` must be set in `.env`. LiteLLM will start without them but cloud models won't be available.

-**Fix:** Always clear cache after updating pipeline files:
-```bash
-docker exec pipelines rm -rf /app/pipelines/__pycache__
-docker restart pipelines
-```
+2. **Healthcheck** — LiteLLM's liveness endpoint is at `/health/liveness` (not `/health`). The healthcheck in compose uses the correct URL.
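+
+As a quick check of both from the host (a sketch: `/health/liveness` is the endpoint named above, `/v1/models` is the standard OpenAI-compatible listing, and you should substitute your real `LITELLM_MASTER_KEY`):
+
+```python
+import json
+import urllib.request
+
+BASE = "http://localhost:4000"
+KEY = "sk-local-admin-changeme"  # your LITELLM_MASTER_KEY from .env
+
+# Liveness probe, the same endpoint the compose healthcheck hits.
+with urllib.request.urlopen(f"{BASE}/health/liveness", timeout=5) as r:
+    print(r.status, r.read().decode())
+
+# Model listing; cloud models only appear once their API keys are set.
+req = urllib.request.Request(f"{BASE}/v1/models", headers={"Authorization": f"Bearer {KEY}"})
+with urllib.request.urlopen(req, timeout=5) as r:
+    print([m["id"] for m in json.load(r)["data"]])
+```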

---

-## Models not showing in Open WebUI after restart
+### Remote instance unreachable

-**Symptom:** Chat model selector is empty after system reboot.
+**Symptom:** Olla reports a remote node as unreachable.

-**Cause:** Open WebUI caches the model list and the cache can go stale.
-
-**Fix:**
+**Check:**
```bash
-docker restart open-webui
+curl http://192.168.1.X:11434/api/tags
+```
+
+**Fix:** Ensure the remote host is reachable from the Docker network. If the host is on the LAN, Olla should be able to reach it. For host-local addresses, you may need `extra_hosts`:
+
+```yaml
+olla:
+  extra_hosts:
+    - "host.docker.internal:host-gateway"
```
-Then refresh the browser.

---

## `docker compose down` fails with "invalid hostPort"

-**Cause:** A port mapping in the compose file has a typo (e.g. `" :8000"` instead of `"8000:8000"`).
+**Cause:** A port mapping in the compose file has a typo.

-**Fix:** Stop containers by name since compose can't parse the file:
+**Fix:** Stop containers by name:
```bash
-docker stop open-webui open-terminal pipelines ollama-arc
-docker rm open-webui open-terminal pipelines ollama-arc
+docker stop ollama-arc litellm olla retriever
+docker rm ollama-arc litellm olla retriever
```

Then fix the typo in `docker-compose.yml` and restart.

@@ -154,13 +142,3 @@ journalctl -xeu ai-stack.service

1. Docker not ready yet — the `After=docker.service` dependency usually handles this, but on slow systems add `sleep 5` to ExecStartPre.
2. GPU pre-flight failed — check `check-arc-gpu.sh` output in the journal.
3. Port conflict — another service is using one of your configured ports.
-
----
-
-## open-webui volume issues after migration
-
-If you moved from a separate compose setup, the `open-webui` volume is marked `external: true` and must exist before the stack starts:
-
-```bash
-docker volume create open-webui
-```
diff --git a/install.sh b/install.sh
index f457a3a..89f7da0 100644
--- a/install.sh
+++ b/install.sh
@@ -39,7 +39,7 @@ fi

if grep -q '/dev/null; then
    info "Resolving VaultWarden placeholders in .env..."
    if [[ -f "${SCRIPT_DIR}/scripts/resolve-vaultwarden.sh" ]]; then
-        bash "${SCRIPT_DIR}/scripts/resolve-vaultwarden.sh" --in-place
+        bash "${SCRIPT_DIR}/scripts/resolve-vaultwarden.sh"
    else
        warn "resolve-vaultwarden.sh not found, sourcing .env as-is"
    fi
@@ -115,14 +115,19 @@ check_memory() {
    fi
}

-check_obsidian_vault() {
-    local vault_path="${OBSIDIAN_VAULT_PATH:-}"
+check_vault() {
+    local vault_path="${RETRIEVER_VAULT_PATH:-}"
    if [[ -z "$vault_path" ]]; then
-        warn "OBSIDIAN_VAULT_PATH not set in .env — Khoj will start but vault indexing will be disabled."
-        warn "Set OBSIDIAN_VAULT_PATH in .env and restart to enable vault search."
+        warn "RETRIEVER_VAULT_PATH not set in .env — retriever will start but vault indexing will be disabled."
    elif [[ ! -d "$vault_path" ]]; then
-        warn "OBSIDIAN_VAULT_PATH=${vault_path} does not exist — Khoj will start but vault won't be mounted."
-        warn "Create the directory or fix the path in .env."
+        warn "RETRIEVER_VAULT_PATH=${vault_path} does not exist."
+        read -rp "Create this directory now? [Y/n] " create_vault
+        if [[ ! "${create_vault,,}" =~ ^n ]]; then
+            mkdir -p "$vault_path"
+            success "Created vault directory: ${vault_path}"
+        else
+            warn "Vault directory not created — retriever will start but vault won't be mounted."
+        fi
    else
        success "Obsidian vault found: ${vault_path}"
    fi
@@ -132,16 +137,16 @@ check_docker
check_docker_group
check_intel_gpu
check_memory
-check_obsidian_vault
+check_vault

# ─── Create docker volumes ────────────────────────────────────────────────────
header "Docker Volumes"

-if ! docker volume inspect open-webui &>/dev/null; then
-    docker volume create open-webui
-    success "Created docker volume: open-webui"
+if ! docker volume inspect ai-stack_retriever-data &>/dev/null; then
+    docker volume create ai-stack_retriever-data
+    success "Created docker volume: retriever-data"
else
-    success "Docker volume open-webui already exists."
+    success "Docker volume retriever-data already exists."
fi

# ─── Install systemd service ────────────────────────────────────────────────
@@ -159,43 +164,7 @@ sudo systemctl daemon-reload
sudo systemctl enable ai-stack.service
success "Installed and enabled ai-stack.service"

-# ─── Install pipelines ────────────────────────────────────────────────────────
-# NOTE: Only pipeline .py files are deployed here (pipelines/ directory).
-# Tools (System Diagnostics etc.) live in Open WebUI's database and must be
-# added via post-install.sh or manually via Admin Panel → Tools.
-header "Installing Pipelines"
-
-install_pipelines() {
-    info "Starting pipelines container to install pipeline files..."
- - cd "${INSTALL_DIR}" - docker compose up -d pipelines - sleep 5 - - # Clear pycache to avoid stale bytecode - docker exec pipelines rm -rf /app/pipelines/__pycache__ 2>/dev/null || true - - local installed=0 - for f in "${SCRIPT_DIR}/pipelines/"*.py; do - [[ -f "$f" ]] || continue - docker cp "$f" "pipelines:/app/pipelines/$(basename "$f")" - success "Installed pipeline: $(basename "$f")" - (( installed++ )) || true - done - - if (( installed == 0 )); then - warn "No pipeline files found in ${SCRIPT_DIR}/pipelines/ — skipping." - fi - - docker exec pipelines rm -rf /app/pipelines/__pycache__ 2>/dev/null || true - docker restart pipelines - sleep 3 - - docker logs pipelines --tail 10 | grep -E "Loaded module|ERROR" || true - success "Pipelines installed and restarted." -} - -install_pipelines +# OpenCode (CLI + Obsidian sidebar plugin) is the primary AI interface. # ─── Start the full stack ───────────────────────────────────────────────────── header "Starting AI Stack" @@ -231,9 +200,11 @@ if [[ "${pull_models,,}" == "y" ]]; then for model in ${MODELS_TO_PULL}; do info "Pulling ${model}..." - docker exec ollama-arc "${OLLAMA_BIN}" pull "${model}" \ - && success "Pulled: ${model}" \ - || warn "Failed to pull: ${model} (check container logs)" + if docker exec ollama-arc "${OLLAMA_BIN}" pull "${model}"; then + success "Pulled: ${model}" + else + warn "Failed to pull: ${model} (check container logs)" + fi done else info "Skipping model pull. Pull manually with:" @@ -242,23 +213,363 @@ else done fi +# ─── Install OpenCode ───────────────────────────────────────────────────────── +header "OpenCode CLI" + +if command -v opencode &>/dev/null; then + success "OpenCode already installed ($(opencode --version 2>/dev/null || echo 'unknown version'))" +else + info "OpenCode is the primary AI interface for this stack." + read -rp "Install OpenCode now? [Y/n] " install_oc + if [[ ! "${install_oc,,}" =~ ^n ]]; then + if command -v npm &>/dev/null; then + info "Installing via npm..." + npm install -g opencode-ai + elif command -v bun &>/dev/null; then + info "Installing via bun..." + bun install -g opencode-ai + else + info "Installing via install script..." + curl -fsSL https://opencode.ai/install | bash + fi + if command -v opencode &>/dev/null; then + success "OpenCode installed." + else + warn "OpenCode installation may need manual steps. See https://opencode.ai/docs" + fi + else + info "Skipping OpenCode install. Install later: curl -fsSL https://opencode.ai/install | bash" + fi +fi + +# ─── Install Bun (needed by OpenCode Obsidian plugin) ───────────────────────── +header "Bun Runtime" + +if command -v bun &>/dev/null; then + success "Bun already installed ($(bun --version 2>/dev/null || echo 'unknown version'))" +else + info "Bun is required by the OpenCode Obsidian plugin." + read -rp "Install Bun now? [Y/n] " install_bun + if [[ ! "${install_bun,,}" =~ ^n ]]; then + info "Installing Bun..." + curl -fsSL https://bun.sh/install | bash + if command -v bun &>/dev/null; then + success "Bun installed." + else + warn "Bun installed but may need a new shell session or PATH update." + fi + else + info "Skipping Bun install. 
Install later: curl -fsSL https://bun.sh/install | bash"
+    fi
+fi
+
+# ─── Configure OpenCode with stack providers ──────────────────────────────────
+header "OpenCode Configuration"
+
+OC_CONFIG_DIR="${HOME}/.opencode"
+OC_CONFIG="${OC_CONFIG_DIR}/config.json"
+
+if command -v opencode &>/dev/null; then
+    mkdir -p "${OC_CONFIG_DIR}"
+    if [[ -f "${OC_CONFIG}" ]]; then
+        success "OpenCode config already exists at ${OC_CONFIG}"
+    else
+        info "Creating global OpenCode config with stack providers..."
+        cat > "${OC_CONFIG}" << OCEOF
+{
+  "\$schema": "https://opencode.ai/config.json",
+  "provider": {
+    "olla": {
+      "npm": "@ai-sdk/openai-compatible",
+      "name": "Olla (local Ollama cluster)",
+      "options": {
+        "baseURL": "http://localhost:40114/v1"
+      },
+      "models": {
+        "qwen3.5:14b": {
+          "name": "Qwen 3.5 14B (default)"
+        },
+        "gemma4:27b": {
+          "name": "Gemma 4 27B (heavy lifting)"
+        },
+        "mistral-small3.2:24b": {
+          "name": "Mistral Small 3.2 24B (tool calling)"
+        },
+        "qwen2.5:14b": {
+          "name": "Qwen 2.5 14B (diagnostics)"
+        },
+        "qwen2.5-coder:14b": {
+          "name": "Qwen 2.5 Coder 14B (code)"
+        },
+        "deepseek-r1:14b": {
+          "name": "DeepSeek R1 14B (reasoning)"
+        },
+        "gemma3:12b": {
+          "name": "Gemma 3 12B (longform/logs)"
+        },
+        "nomic-embed-text": {
+          "name": "Nomic Embed Text (embeddings)"
+        }
+      }
+    },
+    "litellm": {
+      "npm": "@ai-sdk/openai-compatible",
+      "name": "LiteLLM (cloud models)",
+      "options": {
+        "baseURL": "http://localhost:4000/v1"
+      },
+      "models": {
+        "claude-sonnet-4-20250514": {
+          "name": "Claude Sonnet 4 (Anthropic)"
+        },
+        "gemini-2.0-flash-001": {
+          "name": "Gemini 2.0 Flash (Google)"
+        }
+      }
+    }
+  }
+}
+OCEOF
+        success "Created OpenCode config at ${OC_CONFIG}"
+        info "You can add more models by editing ~/.opencode/config.json"
+    fi
+fi
+
+# ─── Install OpenCode Obsidian plugin ──────────────────────────────────────────
+header "OpenCode Obsidian Plugin"
+
+info "The OpenCode Obsidian plugin embeds the AI assistant in your sidebar."
+info "It needs to be installed in this vault's .obsidian/plugins directory."
+
+if command -v opencode &>/dev/null && command -v bun &>/dev/null; then
+    PLUGIN_DIR="${SCRIPT_DIR}/.obsidian/plugins/obsidian-opencode"
+    if [[ -d "${PLUGIN_DIR}" ]]; then
+        success "OpenCode Obsidian plugin already installed"
+    else
+        info "Cloning opencode-obsidian plugin..."
+        mkdir -p "${SCRIPT_DIR}/.obsidian/plugins"
+        if git clone https://github.com/growlf/opencode-obsidian.git "${PLUGIN_DIR}" 2>/dev/null; then
+            info "Building plugin..."
+            if (cd "${PLUGIN_DIR}" && bun install && bun run build) 2>/dev/null; then
+                success "OpenCode Obsidian plugin installed and built."
+                # Auto-enable in community-plugins.json
+                echo '["opencode-obsidian"]' > "${SCRIPT_DIR}/.obsidian/community-plugins.json"
+                success "Plugin enabled. Restart Obsidian to see the sidebar icon."
+            else
+                warn "Plugin build failed. Check Bun installation."
+                rm -rf "${PLUGIN_DIR}"
+            fi
+        else
+            warn "Failed to clone plugin repo. Check internet connection."
+        fi
+    fi
+else
+    warn "OpenCode CLI or Bun not installed — skipping plugin setup."
+    info "Install both first, then run:"
+    info "  git clone https://github.com/growlf/opencode-obsidian.git .obsidian/plugins/obsidian-opencode"
+    info "  cd .obsidian/plugins/obsidian-opencode && bun install && bun run build"
+fi
+
+# ─── Bitwarden / VaultWarden Secret Management (optional) ──────────────────────
+header "Bitwarden / VaultWarden"
+
+info "The stack can resolve placeholders in .env"
+info "using Bitwarden (or self-hosted VaultWarden) for secret management."
+info "This lets you store API keys in your vault instead of plaintext in .env." +echo "" + +read -rp "Configure Bitwarden secret management? [y/N] " setup_bw +if [[ "${setup_bw,,}" != "y" ]]; then + info "Skipping Bitwarden setup." +else + # ── Check for existing session ───────────────────────────────────────── + BW_HAS_SESSION=false + if command -v bw &>/dev/null; then + bw_status=$(bw status 2>/dev/null || echo '{"status":"unauthenticated"}') + if echo "$bw_status" | grep -q '"status":"unlocked"'; then + BW_HAS_SESSION=true + success "Bitwarden vault already unlocked." + fi + fi + + # ── Install bw CLI if missing ────────────────────────────────────────── + if ! command -v bw &>/dev/null; then + info "Installing Bitwarden CLI via npm..." + if ! command -v npm &>/dev/null; then + info "npm not found — installing Node.js..." + if command -v snap &>/dev/null; then + sudo snap install node --classic + elif command -v apt-get &>/dev/null; then + sudo apt-get update -qq && sudo apt-get install -y -qq nodejs npm + else + warn "Cannot install npm automatically." + info "Install Node.js manually, then run: npm install -g @bitwarden/cli" + fi + fi + if command -v npm &>/dev/null; then + npm install -g @bitwarden/cli + if command -v bw &>/dev/null; then + success "Bitwarden CLI installed." + else + warn "bw CLI install may need a new shell or PATH update." + fi + fi + fi + + if ! command -v bw &>/dev/null; then + warn "bw CLI not available — skipping Bitwarden configuration." + info "Install manually: npm install -g @bitwarden/cli" + elif [[ "$BW_HAS_SESSION" != "true" ]]; then + # ── Server URL (self-hosted VaultWarden) ───────────────────────── + echo "" + info "Are you using Bitwarden cloud (bitwarden.com) or a self-hosted VaultWarden?" + read -rp "Self-hosted VaultWarden URL (or leave blank for Bitwarden cloud): " BW_SERVER_URL_VAL + if [[ -n "${BW_SERVER_URL_VAL}" ]]; then + if [[ "${BW_SERVER_URL_VAL,,}" != https://* ]]; then + warn "URL must use HTTPS. Prepending https://" + BW_SERVER_URL_VAL="https://${BW_SERVER_URL_VAL}" + fi + bw config server "$BW_SERVER_URL_VAL" >/dev/null 2>&1 + success "VaultWarden server configured: ${BW_SERVER_URL_VAL}" + fi + + # ── Login ──────────────────────────────────────────────────────── + echo "" + info "Log in to Bitwarden now. Your master password is used only for this" + info "one-time login and will NOT be stored anywhere." + read -rp "Bitwarden email: " BW_EMAIL + read -rsp "Master password (not stored): " BW_MASTER_PW + echo "" + + export BW_CLIENT_ID="" + export BW_CLIENT_SECRET="" + BW_SESSION=$(echo "$BW_MASTER_PW" | bw login "$BW_EMAIL" --raw 2>/dev/null || true) + BW_MASTER_PW="" + if [[ -z "$BW_SESSION" ]]; then + warn "Login failed. You may have 2FA enabled." + info "Run 'bw login $BW_EMAIL' manually in another terminal, then re-run install.sh." + else + success "Logged in as ${BW_EMAIL}." + export BW_SESSION + bw sync >/dev/null 2>&1 + fi + fi + + if command -v bw &>/dev/null; then + # ── Organization ID ────────────────────────────────────────────── + echo "" + info "You need a Bitwarden organization ID to scope secret lookups." + info "Find it by logging into the Bitwarden web vault → Settings → Organizations." 
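+        # Tip (an aside, not part of the original flow): with an unlocked
+        # session, organization IDs can also be listed from the CLI:
+        #   bw list organizations --session "$BW_SESSION" | python3 -m json.tool
+        # A non-interactive alternative to the email/password login above is
+        # `bw login --apikey`; note the CLI reads BW_CLIENTID / BW_CLIENTSECRET
+        # (no underscores), unlike this project's .env variable names.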
+ echo "" + read -rp "Bitwarden Organization ID (leave blank to skip): " BW_ORG_ID + + if [[ -n "${BW_ORG_ID}" ]]; then + # ── API key setup ─────────────────────────────────────────── + echo "" + info "Generate a Bitwarden API key for non-interactive secret resolution:" + info " Web vault → Settings → Security → Keys tab → View API Key" + info " (Enter your master password to view, then copy the values.)" + echo "" + read -rp "BW_CLIENT_ID (e.g. user.xxxxxx): " BW_CLIENT_ID_VAL + read -rsp "BW_CLIENT_SECRET: " BW_CLIENT_SECRET_VAL + echo "" + + # Remove any existing LITELLM_MASTER_KEY from .env (avoid duplicates) + if grep -q '^LITELLM_MASTER_KEY=' "${SCRIPT_DIR}/.env" 2>/dev/null; then + sed -i '/^LITELLM_MASTER_KEY=/d' "${SCRIPT_DIR}/.env" + info "Removed existing LITELLM_MASTER_KEY from .env (will be replaced)." + fi + + # ── Write to .env ─────────────────────────────────────────── + if [[ -n "${BW_SERVER_URL_VAL:-}" ]]; then + echo "BW_SERVER_URL=${BW_SERVER_URL_VAL}" >> .env + fi + { + echo "" + echo "# ─── Bitwarden / VaultWarden (added by install.sh) ─────────────────" + echo "BW_CLIENT_ID=${BW_CLIENT_ID_VAL}" + echo "BW_CLIENT_SECRET=${BW_CLIENT_SECRET_VAL}" + echo "" + echo "# Secrets stored in Bitwarden — resolved via resolve-vaultwarden.sh" + echo "# Format: " + echo "ANTHROPIC_API_KEY=" + echo "GEMINI_API_KEY=" + echo "LITELLM_MASTER_KEY=" + } >> .env + + # ── Auto-generate LiteLLM key and store in Bitwarden ──────── + if command -v bw &>/dev/null; then + LITELLM_KEY="sk-$(openssl rand -hex 24 2>/dev/null || head -c32 < /dev/urandom | xxd -p -c64)" + litellm_item=$(bw list items --search "litellm-master-key" --organizationid "$BW_ORG_ID" --session "$BW_SESSION" 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +for item in (data if isinstance(data, list) else []): + if item.get('name') == 'litellm-master-key': + print(item['id']) +" 2>/dev/null || true) + if [[ -n "$litellm_item" ]]; then + info "Updating existing litellm-master-key in vault..." + bw get item "$litellm_item" --session "$BW_SESSION" 2>/dev/null | \ + python3 -c " +import sys, json +item = json.load(sys.stdin) +item['login']['password'] = '${LITELLM_KEY}' +print(json.dumps(item)) +" 2>/dev/null | \ + bw encode | \ + bw edit item "$litellm_item" --session "$BW_SESSION" >/dev/null 2>&1 || true + else + info "Creating litellm-master-key in vault..." + item_json=$(printf '{"organizationId":"%s","name":"litellm-master-key","type":1,"login":{"username":"litellm","password":"%s","uris":[]}}' "$BW_ORG_ID" "$LITELLM_KEY") + echo "$item_json" | bw encode | bw create item --session "$BW_SESSION" >/dev/null 2>&1 || true + fi + fi + + # ── Attempt resolution ────────────────────────────────────── + info "Attempting to resolve placeholders now..." + if bash "${SCRIPT_DIR}/scripts/resolve-vaultwarden.sh"; then + success "Placeholders resolved — secrets pulled from vault." + else + warn "Resolution incomplete. Create these items in your vault:" + echo " 1. ${BW_ORG_ID}/anthropic-api-key (login item, password = API key)" + echo " 2. ${BW_ORG_ID}/gemini-api-key (login item, password = API key)" + echo "" + echo " litellm-master-key was auto-created with a generated key." + echo " Then run: ./scripts/resolve-vaultwarden.sh" + fi + else + warn "No organization ID — skipping Bitwarden setup." 
+ fi + fi +fi + # ─── Done ───────────────────────────────────────────────────────────────────── header "Installation Complete" -WEBUI_PORT="${WEBUI_PORT:-3000}" -KHOJ_PORT="${KHOJ_PORT:-42110}" +OLLA_PORT="${OLLA_PORT:-40114}" +RETRIEVER_PORT="${RETRIEVER_PORT:-42000}" echo -e "${GREEN}${BOLD}Stack is running!${RESET}" echo "" -echo -e " Open WebUI: ${BOLD}http://localhost:${WEBUI_PORT}${RESET}" -echo -e " Ollama API: ${BOLD}http://localhost:${OLLAMA_PORT:-11434}${RESET}" -echo -e " Pipelines: ${BOLD}http://localhost:${PIPELINES_PORT:-9099}${RESET}" -echo -e " Khoj: ${BOLD}http://localhost:${KHOJ_PORT}${RESET}" +echo -e " Olla (router): ${BOLD}http://localhost:${OLLA_PORT}${RESET}" +echo -e " Retriever: ${BOLD}http://localhost:${RETRIEVER_PORT}/health${RESET}" +echo -e " Ollama API: ${BOLD}http://localhost:${OLLAMA_PORT:-11434}${RESET}" +echo -e " LiteLLM UI: ${BOLD}http://localhost:${LITELLM_PORT:-4000}/ui${RESET}" echo "" echo -e "${YELLOW}Next steps:${RESET}" echo "" -echo -e " 1. Run ${BOLD}./post-install.sh${RESET} to auto-configure Open WebUI" -echo -e " 2. Follow ${BOLD}docs/khoj-setup.md${RESET} to connect Obsidian to Khoj" +echo -e " ${BOLD}Obsidian setup:${RESET}" +echo -e " 1. Open Obsidian" +echo -e " 2. Click 'Open folder as vault' (or 'Manage vaults' → 'Open')" +echo -e " 3. Select this project folder: ${BOLD}${SCRIPT_DIR}${RESET}" +echo -e " 4. Go to Settings → Community Plugins → enable ${BOLD}OpenCode${RESET}" +echo -e " 5. Click the terminal icon in the sidebar (or Ctrl+Shift+O)" +echo "" +echo -e " ${BOLD}RAG / vault search:${RESET}" +echo -e " The retriever indexes notes at: ${BOLD}RETRIEVER_VAULT_PATH${RESET}" +echo -e " Currently configured as: ${BOLD}${RETRIEVER_VAULT_PATH:-/home/${STACK_USER}/obsidian}${RESET}" +echo -e " If your notes live elsewhere, update RETRIEVER_VAULT_PATH in .env" +echo -e " Then restart the stack and use OpenCode to search your vault." echo "" -echo -e " Full guide: ${BOLD}docs/post-install.md${RESET}" +echo -e " ${BOLD}Need help?${RESET} docs/retriever-guide.md | docs/troubleshooting.md" diff --git a/pipelines/smart_model_router.py b/pipelines/smart_model_router.py deleted file mode 100644 index 84129a7..0000000 --- a/pipelines/smart_model_router.py +++ /dev/null @@ -1,125 +0,0 @@ -""" -title: Smart Model Router -description: Auto-routes queries to the best local model based on content analysis. 
- Diagnostics → qwen2.5:14b, Scripting → qwen2.5-coder:14b, - Reasoning → deepseek-r1:14b, Longform → gemma3:12b -version: 0.3.1 -""" - -from pydantic import BaseModel, Field -from typing import Optional -import re - - -class Pipeline: - class Valves(BaseModel): - diagnostics_model: str = Field( - default="qwen2.5:14b", - description="Tool calling, health checks, system status, alerts" - ) - scripting_model: str = Field( - default="qwen2.5-coder:14b", - description="Configs, scripts, shell commands, code" - ) - reasoning_model: str = Field( - default="deepseek-r1:14b", - description="Complex troubleshooting, root cause analysis" - ) - longform_model: str = Field( - default="gemma3:12b", - description="Long logs, summaries, documentation" - ) - debug: bool = Field( - default=False, - description="Prepend routing decision to each response" - ) - - def __init__(self): - self.name = "Smart Model Router" - self.type = "filter" - self.id = "smart_model_router" - self.valves = self.Valves() - - async def on_startup(self): - print("[Router] Pipeline started") - - async def on_shutdown(self): - print("[Router] Pipeline stopped") - - def _classify(self, text: str) -> tuple[str, str]: - """Classify query and return (model, reason).""" - t = text.lower() - - diagnostic_patterns = [ - r"\b(diagnos|health|status|check|monitor|alert|reachable|unreachable|uptime)\b", - r"\b(system report|get_all|list models|loaded models|vram)\b", - r"\b(is .+ running|is .+ up|is .+ down|ping)\b", - r"\b(ollama|open.?webui|pipeline|container|docker)\b", - r"\b(gpu|cpu|memory|ram|disk usage)\b", - ] - for p in diagnostic_patterns: - if re.search(p, t): - return self.valves.diagnostics_model, "diagnostics" - - scripting_patterns = [ - r"\b(script|bash|shell|command|cron|systemd|service|config)\b", - r"\b(yaml|compose|dockerfile|ansible|terraform)\b", - r"\b(fix|debug|error|traceback|exception|failed|exit code)\b", - r"\b(install|setup|configure|deploy|update|upgrade)\b", - r"\b(python|javascript|typescript|code|function|class|import)\b", - ] - for p in scripting_patterns: - if re.search(p, t): - return self.valves.scripting_model, "scripting" - - reasoning_patterns = [ - r"\b(why|root cause|explain|analyze|compare|optimize|recommend)\b", - r"\b(should i|what would you|best approach|pros and cons)\b", - r"\b(performance|bottleneck|slow|latency|memory leak|high cpu)\b", - r"\b(architecture|design|strategy|best practice|tradeoff)\b", - ] - for p in reasoning_patterns: - if re.search(p, t): - return self.valves.reasoning_model, "reasoning" - - longform_patterns = [ - r"\b(log|logs|summarize|summary|document|report)\b", - r"\b(what does this mean|walk me through|step by step|explain this)\b", - r"\b(write a|draft a|create a document|generate a report)\b", - ] - for p in longform_patterns: - if re.search(p, t): - return self.valves.longform_model, "longform" - - # Default — diagnostics model handles most sysadmin queries well - return self.valves.diagnostics_model, "default" - - async def inlet(self, body: dict, user: Optional[dict] = None) -> dict: - """Intercept request, classify, and override model before it goes out.""" - messages = body.get("messages", []) - if not messages: - return body - - # Find the last user message - user_message = "" - for m in reversed(messages): - if m.get("role") == "user": - user_message = m.get("content", "") - break - - if not user_message: - return body - - model, reason = self._classify(user_message) - body["model"] = model - print(f"[Router] '{user_message[:80]}' → {model} ({reason})") - - if 
self.valves.debug: - debug_msg = f"[Router → {model} ({reason})]" - if messages and messages[0].get("role") == "system": - messages[0]["content"] = debug_msg + "\n" + messages[0]["content"] - else: - messages.insert(0, {"role": "system", "content": debug_msg}) - body["messages"] = messages - - return body diff --git a/post-install.sh b/post-install.sh deleted file mode 100755 index cffc45f..0000000 --- a/post-install.sh +++ /dev/null @@ -1,638 +0,0 @@ -#!/usr/bin/env bash -# ─── post-install.sh ────────────────────────────────────────────────────────── -# Configures Open WebUI, LiteLLM, and Khoj after the stack comes up. -# Uses OLLAMA_REMOTE_* entries in .env as the single source of truth for -# all Ollama instances — no node addresses are baked into this script. -# -# What this does (in order): -# 1. Generate tools/system_diagnostics.py with current instance list -# 2. Wait for Open WebUI to become healthy -# 3. Create or sign in to the admin account -# 4. Register Ollama connections (local arc + all OLLAMA_REMOTE_* nodes) -# 5. Register Pipelines connection -# 6. Register LiteLLM as an OpenAI-compatible connection (Claude, Gemini) -# 7. Register Open Terminal -# 8. Deploy System Diagnostics tool to Open WebUI -# 9. Enable System Diagnostics tool on all models -# 10. Verify Khoj health and print Obsidian plugin setup instructions -# 11. Configure Khoj chat models via Django shell -# -# Usage: -# ./post-install.sh # apply all configuration -# ./post-install.sh --dry-run # preview changes without applying -# -# Verified API prefixes (Open WebUI v0.9.x): -# /api/v1/auths — signup/signin -# /api/v1/configs — connections, ollama, openai -# /api/v1/tools — tool management -# /api/v1/models — model management -# /api/v1/terminals — terminal integration -# /api/v1/pipelines — pipeline management - -set -euo pipefail - -RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m' -BLUE='\033[0;34m'; CYAN='\033[0;36m'; BOLD='\033[1m'; RESET='\033[0m' - -info() { echo -e "${BLUE}[INFO]${RESET} $*"; } -success() { echo -e "${GREEN}[OK]${RESET} $*"; } -warn() { echo -e "${YELLOW}[WARN]${RESET} $*"; } -error() { echo -e "${RED}[ERROR]${RESET} $*" >&2; exit 1; } -header() { echo -e "\n${BOLD}${BLUE}═══ $* ═══${RESET}\n"; } -dry() { echo -e "${CYAN}[DRY-RUN]${RESET} $*"; } -would() { echo -e "${CYAN} →${RESET} $*"; } - -# ─── Parse args ─────────────────────────────────────────────────────────────── -DRY_RUN=false -for arg in "$@"; do - [[ "$arg" == "--dry-run" ]] && DRY_RUN=true -done - -if [[ "$DRY_RUN" == "true" ]]; then - echo -e "\n${BOLD}${CYAN}╔═══════════════════════════════════════╗" - echo -e "║ DRY-RUN MODE — no changes ║" - echo -e "╚═══════════════════════════════════════╝${RESET}\n" -fi - -# ─── Load .env ──────────────────────────────────────────────────────────────── -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -[[ -f "${SCRIPT_DIR}/.env" ]] || error ".env not found. Copy .env.example first." -set -a -# shellcheck disable=SC1091 -source "${SCRIPT_DIR}/.env" -set +a - -WEBUI_PORT="${WEBUI_PORT:-3000}" -WEBUI_URL="http://localhost:${WEBUI_PORT}" -KHOJ_PORT="${KHOJ_PORT:-42110}" -KHOJ_URL="http://localhost:${KHOJ_PORT}" -LITELLM_PORT="${LITELLM_PORT:-4000}" -LITELLM_URL="http://localhost:${LITELLM_PORT}" -LITELLM_MASTER_KEY="${LITELLM_MASTER_KEY:-}" -[[ -n "${LITELLM_MASTER_KEY}" ]] || error "LITELLM_MASTER_KEY must be set in ${SCRIPT_DIR}/.env and must not be empty." 
-[[ "${LITELLM_MASTER_KEY}" != "sk-local-admin" ]] || error "LITELLM_MASTER_KEY must not use the default placeholder value 'sk-local-admin'. Set a unique secret in ${SCRIPT_DIR}/.env." -PIPELINES_API_KEY="${PIPELINES_API_KEY:-changeme}" -OPEN_TERMINAL_API_KEY="${OPEN_TERMINAL_API_KEY:-changeme}" - -# ─── Collect OLLAMA_REMOTE_* entries from .env ──────────────────────────────── -# Same parsing logic as generate-olla-config.sh — single source of truth. -declare -A REMOTE_INSTANCES # name → url -declare -A REMOTE_PRIORITIES # name → priority (default 70) - -while IFS='=' read -r key val; do - [[ "$key" =~ ^OLLAMA_REMOTE_([A-Za-z0-9_]+)$ ]] || continue - name="${BASH_REMATCH[1]}" - val="${val%%#*}"; val="${val#"${val%%[![:space:]]*}"}"; val="${val%"${val##*[![:space:]]}"}" - [[ -n "$val" ]] || continue - REMOTE_INSTANCES["$name"]="$val" - priority_var="OLLAMA_REMOTE_${name}_PRIORITY" - REMOTE_PRIORITIES["$name"]="${!priority_var:-70}" -done < "${SCRIPT_DIR}/.env" - -info "Instances from .env:" -info " local (arc) → http://ollama-arc:11434 [priority 100]" -for name in "${!REMOTE_INSTANCES[@]}"; do - info " ${name,,} → ${REMOTE_INSTANCES[$name]} [priority ${REMOTE_PRIORITIES[$name]}]" -done -info " litellm-cloud → http://litellm:4000 [priority 50]" - -# ─── API helper ─────────────────────────────────────────────────────────────── -api() { - local method="$1" endpoint="$2" data="${3:-}" - local args=(-sf -X "${method}" "${WEBUI_URL}${endpoint}" - -H "Content-Type: application/json") - [[ -n "${TOKEN:-}" ]] && args+=(-H "Authorization: Bearer ${TOKEN}") - [[ -n "$data" ]] && args+=(-d "$data") - curl "${args[@]}" 2>/dev/null || echo "" -} - -# ─── Step 1: Generate system_diagnostics.py ─────────────────────────────────── -header "Step 1: Generate System Diagnostics Tool" - -TOOL_FILE="${SCRIPT_DIR}/tools/system_diagnostics.py" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would regenerate ${TOOL_FILE} with OLLAMA_INSTANCES from .env:" - would "local → http://ollama-arc:11434" - for name in "${!REMOTE_INSTANCES[@]}"; do - would "${name,,} → ${REMOTE_INSTANCES[$name]}" - done -else - [[ -f "$TOOL_FILE" ]] || error "tools/system_diagnostics.py not found. Expected at: ${TOOL_FILE}" - - # Build python assignment statements for remote instances - REMOTE_ASSIGNMENTS="" - for name in "${!REMOTE_INSTANCES[@]}"; do - REMOTE_ASSIGNMENTS+="instances['${name,,}'] = '${REMOTE_INSTANCES[$name]}'"$'\n' - done - - python3 - "${TOOL_FILE}" << PYEOF -import re, sys - -tool_path = sys.argv[1] - -instances = {"local": "http://ollama-arc:11434"} -${REMOTE_ASSIGNMENTS} - -lines = ['OLLAMA_INSTANCES = {'] -for k, v in instances.items(): - lines.append(f' "{k}": "{v}",') -lines.append('}') -instances_block = '\n'.join(lines) - -with open(tool_path, 'r') as f: - content = f.read() - -content = re.sub( - r'OLLAMA_INSTANCES\s*=\s*\{[^}]*\}', - instances_block, - content, - flags=re.DOTALL -) - -with open(tool_path, 'w') as f: - f.write(content) - -print(f"Written with {len(instances)} instance(s): {', '.join(instances.keys())}") -PYEOF - success "Generated tools/system_diagnostics.py" - info "Tip: commit this file — it's safe to check in (no IPs, just logical names)." -fi - -# ─── Step 2: Wait for Open WebUI ───────────────────────────────────────────── -header "Step 2: Open WebUI" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would wait for Open WebUI at ${WEBUI_URL}/health" -else - info "Waiting for Open WebUI at ${WEBUI_URL}..." 
- waited=0; max_wait=120 - while (( waited < max_wait )); do - if curl -sf "${WEBUI_URL}/health" 2>/dev/null | grep -q "true"; then - success "Open WebUI is ready." - break - fi - sleep 3; (( waited += 3 )) || true - done - (( waited >= max_wait )) && error "Open WebUI did not become ready. Check: docker logs open-webui" -fi - -# ─── Step 3: Admin account ──────────────────────────────────────────────────── -header "Step 3: Admin Account" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would check /api/v1/auths/admin/config for existing admin account" - would "Fresh install: create account (prompt for email / password / name)" - would "Existing account: sign in (prompt for credentials)" -else - SETUP_STATUS=$(curl -sf "${WEBUI_URL}/api/v1/auths/admin/config" \ - -H "Content-Type: application/json" 2>/dev/null || echo "") - - if echo "$SETUP_STATUS" | grep -q '"showAdminDetails"'; then - FRESH_INSTALL=true - else - FRESH_INSTALL=false - fi - - if [[ "$FRESH_INSTALL" == "false" ]]; then - echo "" - echo -e "${YELLOW}An admin account already exists.${RESET}" - read -rp "Sign in to existing account? [Y/n] " skip_create - if [[ "${skip_create,,}" != "n" ]]; then - read -rp "Admin email: " ADMIN_EMAIL - read -rsp "Admin password: " ADMIN_PASSWORD; echo "" - RESPONSE=$(api POST /api/v1/auths/signin \ - "{\"email\":\"${ADMIN_EMAIL}\",\"password\":\"${ADMIN_PASSWORD}\"}") - if echo "$RESPONSE" | grep -q '"token"'; then - TOKEN=$(echo "$RESPONSE" | python3 -c "import sys,json; print(json.load(sys.stdin)['token'])") - success "Signed in as ${ADMIN_EMAIL}." - else - error "Sign in failed. Check credentials." - fi - else - FRESH_INSTALL=true - fi - fi - - if [[ "$FRESH_INSTALL" == "true" ]]; then - read -rp "Admin email: " ADMIN_EMAIL - read -rsp "Admin password: " ADMIN_PASSWORD; echo "" - read -rp "Display name: " ADMIN_NAME - RESPONSE=$(api POST /api/v1/auths/signup \ - "{\"email\":\"${ADMIN_EMAIL}\",\"password\":\"${ADMIN_PASSWORD}\",\"name\":\"${ADMIN_NAME}\"}") - if echo "$RESPONSE" | grep -q '"token"'; then - TOKEN=$(echo "$RESPONSE" | python3 -c "import sys,json; print(json.load(sys.stdin)['token'])") - success "Admin account created." - else - error "Could not create admin account. Response: ${RESPONSE:0:200}" - fi - fi -fi - -# ─── Step 4: Ollama connections ─────────────────────────────────────────────── -header "Step 4: Ollama Connections" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would configure Ollama connections via /ollama/config/update:" - would "local → http://ollama-arc:11434 [connection_type: local]" - for name in "${!REMOTE_INSTANCES[@]}"; do - would "${name,,} → ${REMOTE_INSTANCES[$name]} [connection_type: external]" - done - warn "Existing connections not in .env will be removed (rebuilt from scratch each run)." 
-else - OLLAMA_URLS='["http://ollama-arc:11434"' - OLLAMA_API_CFGS='{"0":{"enable":true,"tags":[],"prefix_id":"","model_ids":[],"connection_type":"local","auth_type":"bearer","key":""}}' - - idx=1 - for name in "${!REMOTE_INSTANCES[@]}"; do - url="${REMOTE_INSTANCES[$name]}" - OLLAMA_URLS+=",\"${url}\"" - OLLAMA_API_CFGS=$(echo "$OLLAMA_API_CFGS" | python3 -c " -import sys, json -cfgs = json.load(sys.stdin) -cfgs['${idx}'] = {'enable': True, 'tags': [], 'prefix_id': '', - 'model_ids': [], 'connection_type': 'external', - 'auth_type': 'bearer', 'key': ''} -print(json.dumps(cfgs)) -") - (( idx++ )) || true - done - OLLAMA_URLS+="]" - - RESULT=$(curl -sf -X POST "${WEBUI_URL}/ollama/config/update" \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d "{\"ENABLE_OLLAMA_API\":true,\"OLLAMA_BASE_URLS\":${OLLAMA_URLS},\"OLLAMA_API_CONFIGS\":${OLLAMA_API_CFGS}}" \ - 2>/dev/null || echo "") - - if echo "$RESULT" | grep -q "ollama-arc"; then - success "Ollama connections configured (1 local + ${#REMOTE_INSTANCES[@]} remote)." - else - warn "Could not update Ollama connections — set manually in Admin Panel → Connections." - fi -fi - -# ─── Step 5: Pipelines connection ───────────────────────────────────────────── -header "Step 5: Pipelines" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would add Pipelines to OpenAI connections if not already present:" - would "http://pipelines:9099 key: PIPELINES_API_KEY" -else - OPENAI_CFG=$(curl -sf "${WEBUI_URL}/openai/config" \ - -H "Authorization: Bearer ${TOKEN}" 2>/dev/null || echo "{}") - - RESULT=$(echo "$OPENAI_CFG" | python3 -c " -import sys, json -cfg = json.load(sys.stdin) -urls = cfg.get('OPENAI_API_BASE_URLS', []) -keys = cfg.get('OPENAI_API_KEYS', []) -api_cfgs = cfg.get('OPENAI_API_CONFIGS', {}) -url = 'http://pipelines:9099' -key = '${PIPELINES_API_KEY}' -if url not in urls: - idx = str(len(urls)) - urls.append(url); keys.append(key) - api_cfgs[idx] = {'enable': True, 'tags': [], 'prefix_id': '', - 'model_ids': [], 'connection_type': 'external', 'auth_type': 'bearer'} -cfg.update({'OPENAI_API_BASE_URLS': urls, 'OPENAI_API_KEYS': keys, 'OPENAI_API_CONFIGS': api_cfgs}) -print(json.dumps(cfg)) -" | curl -sf -X POST "${WEBUI_URL}/openai/config/update" \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d @- 2>/dev/null || echo "") - - if echo "$RESULT" | grep -q "pipelines"; then - success "Pipelines connection configured." - else - warn "Could not configure Pipelines — set manually in Admin Panel → Connections." - fi -fi - -# ─── Step 6: LiteLLM connection (Claude, Gemini) ───────────────────────────── -header "Step 6: LiteLLM Cloud Connection" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would add LiteLLM to OpenAI connections if not already present:" - would "http://litellm:4000/v1 key: LITELLM_MASTER_KEY" - would "Exposes Claude, Gemini models directly in Open WebUI model picker." 
-else - OPENAI_CFG=$(curl -sf "${WEBUI_URL}/openai/config" \ - -H "Authorization: Bearer ${TOKEN}" 2>/dev/null || echo "{}") - - RESULT=$(echo "$OPENAI_CFG" | python3 -c " -import sys, json -cfg = json.load(sys.stdin) -urls = cfg.get('OPENAI_API_BASE_URLS', []) -keys = cfg.get('OPENAI_API_KEYS', []) -api_cfgs = cfg.get('OPENAI_API_CONFIGS', {}) -url = 'http://litellm:4000/v1' -key = '${LITELLM_MASTER_KEY}' -if url not in urls: - idx = str(len(urls)) - urls.append(url); keys.append(key) - api_cfgs[idx] = {'enable': True, 'tags': [], 'prefix_id': 'cloud', - 'model_ids': [], 'connection_type': 'external', 'auth_type': 'bearer'} -cfg.update({'OPENAI_API_BASE_URLS': urls, 'OPENAI_API_KEYS': keys, 'OPENAI_API_CONFIGS': api_cfgs}) -print(json.dumps(cfg)) -" | curl -sf -X POST "${WEBUI_URL}/openai/config/update" \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d @- 2>/dev/null || echo "") - - if echo "$RESULT" | grep -q "litellm"; then - success "LiteLLM connection configured — Claude and Gemini models now available." - else - warn "Could not configure LiteLLM — set manually in Admin Panel → Connections." - warn " URL: http://litellm:4000/v1 Key: ${LITELLM_MASTER_KEY}" - fi -fi - -# ─── Step 7: Open Terminal ──────────────────────────────────────────────────── -header "Step 7: Open Terminal" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would check /api/v1/terminals/ for existing Open Terminal entry:" - would "If not present: add http://open-terminal:8000" - would "If already present: skip (idempotent)." -else - EXISTING_TERMS=$(curl -sf "${WEBUI_URL}/api/v1/terminals/" \ - -H "Authorization: Bearer ${TOKEN}" 2>/dev/null || echo "[]") - - if echo "$EXISTING_TERMS" | grep -q "open-terminal"; then - success "Open Terminal already configured — no change." - else - RESULT=$(curl -sf -X POST "${WEBUI_URL}/api/v1/terminals/add" \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d "{\"url\":\"http://open-terminal:8000\",\"name\":\"Local\",\"key\":\"${OPEN_TERMINAL_API_KEY}\"}" \ - 2>/dev/null || echo "") - if echo "$RESULT" | grep -q '"id"'; then - success "Open Terminal configured." - else - warn "Could not configure Open Terminal — set manually in Integrations." - fi - fi -fi - -# ─── Step 8: Deploy System Diagnostics tool ─────────────────────────────────── -header "Step 8: Deploy System Diagnostics Tool" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would deploy tools/system_diagnostics.py to Open WebUI:" - would "If tool exists (id: system_diagnostics): update content" - would "If tool missing: create new" -else - if [[ ! -f "$TOOL_FILE" ]]; then - warn "tools/system_diagnostics.py not found — skipping tool deployment." - else - PAYLOAD=$(python3 -c " -import json -print(json.dumps({ - 'id': 'system_diagnostics', - 'name': 'System Diagnostics', - 'content': open('${TOOL_FILE}').read(), - 'meta': {'description': 'Query multiple Ollama instances for models, GPU status, health, and control.'} -})) -") - EXISTING_TOOLS=$(api GET /api/v1/tools/) - - if echo "$EXISTING_TOOLS" | grep -qi "system_diagnostics"; then - RESULT=$(api POST /api/v1/tools/id/system_diagnostics/update "$PAYLOAD" 2>/dev/null || echo "") - if echo "$RESULT" | grep -q '"id"'; then - success "System Diagnostics tool updated." - else - warn "Could not update tool via API — paste tools/system_diagnostics.py manually in Workspace → Tools." 
- fi - else - RESULT=$(api POST /api/v1/tools/create "$PAYLOAD") - if echo "$RESULT" | grep -q '"id"'; then - success "System Diagnostics tool installed." - else - warn "Could not install tool — paste tools/system_diagnostics.py manually in Workspace → Tools." - fi - fi - fi -fi - -# ─── Step 9: Enable tools on models ─────────────────────────────────────────── -header "Step 9: Enable Tools on Models" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would fetch model list from /api/v1/models:" - would "For each model without system_diagnostics in toolIds: add it" - would "Models already configured and embed/pipeline models: skip" -else - CUSTOM_MODELS=$(curl -sf "${WEBUI_URL}/api/v1/models" \ - -H "Authorization: Bearer ${TOKEN}" 2>/dev/null || echo "[]") - - MODEL_IDS=$(echo "$CUSTOM_MODELS" | python3 -c " -import sys, json -models = json.load(sys.stdin) -if isinstance(models, dict): models = models.get('data', []) -for m in models: - mid = m.get('id','') - existing = m.get('meta', {}).get('toolIds', []) - skip = any(s in mid for s in ['embed','smart-router','pipeline']) - if 'system_diagnostics' not in existing and not skip: - print(mid) -" 2>/dev/null || echo "") - - if [[ -z "$MODEL_IDS" ]]; then - success "All models already have System Diagnostics enabled — no change." - else - while IFS= read -r model_id; do - [[ -z "$model_id" ]] && continue - CURRENT_MODEL=$(curl -sf \ - "${WEBUI_URL}/api/v1/models/model?id=${model_id}" \ - -H "Authorization: Bearer ${TOKEN}" 2>/dev/null || echo "{}") - RESULT=$(echo "$CURRENT_MODEL" | python3 -c " -import sys, json -m = json.load(sys.stdin) -meta = m.get('meta', {}) -tools = meta.get('toolIds', []) -if 'system_diagnostics' not in tools: tools.append('system_diagnostics') -meta['toolIds'] = tools -m['meta'] = meta -print(json.dumps({'id': m.get('id'), 'name': m.get('name'), - 'meta': meta, 'params': m.get('params', {})})) -" | curl -sf -X POST \ - "${WEBUI_URL}/api/v1/models/model/update?id=${model_id}" \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d @- 2>/dev/null || echo "") - if echo "$RESULT" | grep -q '"id"'; then - success "Enabled System Diagnostics on: ${model_id}" - else - # Model has no custom entry yet (e.g. LiteLLM proxy models) — create one - RESULT=$(MODEL_ID="$model_id" python3 -c " -import json -import os -model_id = os.environ['MODEL_ID'] -print(json.dumps({ - 'id': model_id, - 'base_model_id': model_id, - 'name': model_id, - 'meta': {'toolIds': ['system_diagnostics']}, - 'params': {} -})) -" | curl -sf -X POST \ - "${WEBUI_URL}/api/v1/models/create" \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d @- 2>/dev/null || echo "") - if echo "$RESULT" | grep -q '"id"'; then - success "Enabled System Diagnostics on: ${model_id}" - else - warn "Could not update: ${model_id}" - fi - fi - done <<< "$MODEL_IDS" - fi -fi - -# ─── Step 10: Verify Khoj ───────────────────────────────────────────────────── -header "Step 10: Khoj" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would wait for Khoj at ${KHOJ_URL}/api/health" - would "On success: attempt to retrieve API key and print Obsidian plugin instructions" - would "On timeout: print troubleshooting steps" -else - info "Waiting for Khoj at ${KHOJ_URL}..." - khoj_ready=false - for i in {1..15}; do - if curl -sf "${KHOJ_URL}/api/health" 2>/dev/null | grep -qi "ok"; then - khoj_ready=true; break - fi - sleep 4 - done - - if [[ "$khoj_ready" == "true" ]]; then - success "Khoj is healthy." 
- - KHOJ_API_KEY=$(curl -sf -X POST "${KHOJ_URL}/auth/token" \ - -H "Content-Type: application/json" \ - -d "{\"username\":\"${KHOJ_ADMIN_EMAIL:-admin@localhost}\",\"password\":\"${KHOJ_ADMIN_PASSWORD:-changeme}\"}" \ - 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('token',''))" 2>/dev/null || echo "") - - echo "" - echo -e "${GREEN}${BOLD}Khoj is ready!${RESET}" - echo "" - echo -e "${YELLOW}Obsidian plugin setup:${RESET}" - echo " 1. Obsidian → Settings → Community Plugins → Browse → search 'Khoj' → Install" - echo " 2. In Khoj plugin settings:" - echo -e " Server URL: ${BOLD}${KHOJ_URL}${RESET}" - if [[ -n "$KHOJ_API_KEY" ]]; then - echo -e " API Key: ${BOLD}${KHOJ_API_KEY}${RESET}" - else - echo " API Key: get from ${KHOJ_URL}/settings (admin credentials)" - fi - echo " 3. Click 'Force Sync' to index your vault immediately" - echo "" - echo -e " Full guide: ${BOLD}docs/khoj-setup.md${RESET}" - else - warn "Khoj did not become ready within 60s." - warn " Check: docker logs khoj" - warn " Khoj requires nomic-embed-text — ensure the model is pulled in Ollama." - warn " Manual setup: see docs/khoj-setup.md" - fi -fi - -# ─── Step 11: Configure Khoj chat models ────────────────────────────────────── -header "Step 11: Khoj Chat Models" - -if [[ "$DRY_RUN" == "true" ]]; then - dry "Would configure Khoj chat models via Django shell inside khoj container:" - would "Create AiModelApi entry: ollama → http://ollama-arc:11434/v1/" - would "Register: gemma3:12b, qwen2.5:14b, qwen2.5-coder:14b, deepseek-r1:14b" - would "Existing entries skipped (get_or_create — idempotent)." -else - if ! curl -sf "${KHOJ_URL}/api/health" 2>/dev/null | grep -qi "ok"; then - warn "Khoj not reachable — skipping model setup." - warn "Re-run post-install.sh once Khoj is healthy, or configure at ${KHOJ_URL}/server/admin" - else - docker exec khoj bash -c 'cat > /tmp/setup_models.py << '"'"'PYEOF'"'"' -import os -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "khoj.app.settings") -import django -django.setup() -from khoj.database.models import ChatModel, AiModelApi - -api, created = AiModelApi.objects.get_or_create( - name="ollama", - defaults={"api_key": "ollama", "api_base_url": "http://ollama-arc:11434/v1/"} -) -print("AiModelApi:", "created" if created else "exists", api.name) - -for name, friendly, strengths in [ - ("gemma4:27b", "Gemma 4 27B", "Heavy lifting, large context"), - ("mistral-small3.2:24b", "Mistral Small 3.2 24B", "Strong function calling, 128K context"), - ("qwen3.5:14b", "Qwen 3.5 14B", "Improved reasoning, tool calling"), - ("gemma3:12b", "Gemma 3 12B", "Long context, logs, summaries"), - ("qwen2.5:14b", "Qwen 2.5 14B", "Tool calling, diagnostics"), - ("qwen2.5-coder:14b", "Qwen 2.5 Coder 14B", "Code, configs, scripting"), - ("deepseek-r1:14b", "DeepSeek R1 14B", "Complex reasoning"), -]: - obj, c = ChatModel.objects.get_or_create( - name=name, - defaults={ - "friendly_name": friendly, - "model_type": ChatModel.ModelType.OPENAI, - "ai_model_api": api, - "strengths": strengths, - "max_prompt_size": 8192, - } - ) - print("ChatModel:", "created" if c else "exists", obj.name) -print("Done!") -PYEOF' - - RESULT=$(docker exec khoj python3 /tmp/setup_models.py 2>&1) - docker exec khoj rm -f /tmp/setup_models.py - - if echo "$RESULT" | grep -q "Done!"; then - echo "$RESULT" | while IFS= read -r line; do info " $line"; done - success "Khoj chat models configured." 
- else - warn "Khoj model setup may have failed:" - echo "$RESULT" - warn "Configure manually at ${KHOJ_URL}/server/admin" - fi - fi -fi - -# ─── Summary ────────────────────────────────────────────────────────────────── -header "Summary" - -if [[ "$DRY_RUN" == "true" ]]; then - echo -e "${CYAN}${BOLD}Dry-run complete — no changes were made.${RESET}" - echo "" - echo -e "${CYAN}Planned instance registrations:${RESET}" - echo " local (arc) → http://ollama-arc:11434" - for name in "${!REMOTE_INSTANCES[@]}"; do - echo " ${name,,} → ${REMOTE_INSTANCES[$name]}" - done - echo " litellm-cloud → http://litellm:4000/v1 (Claude, Gemini)" - echo "" - echo -e "Run ${BOLD}./post-install.sh${RESET} to apply." -else - echo -e "${GREEN}${BOLD}Configuration applied!${RESET}" - echo "" - echo -e " Open WebUI: ${BOLD}${WEBUI_URL}${RESET}" - echo -e " LiteLLM UI: ${BOLD}${LITELLM_URL}/ui${RESET} (admin / ${LITELLM_MASTER_KEY})" - echo -e " Khoj: ${BOLD}${KHOJ_URL}${RESET}" - echo "" - echo -e "${YELLOW}Registered instances:${RESET}" - echo " local (arc) → http://ollama-arc:11434" - for name in "${!REMOTE_INSTANCES[@]}"; do - echo " ${name,,} → ${REMOTE_INSTANCES[$name]}" - done - echo " litellm-cloud → http://litellm:4000/v1 (Claude, Gemini)" - echo "" - echo -e "${YELLOW}tools/system_diagnostics.py was regenerated — safe to commit:${RESET}" - echo -e " ${BOLD}git add tools/ && git commit -m 'update instances'${RESET}" - echo "" - echo -e "${YELLOW}Any [WARN] steps need manual completion — see:${RESET}" - echo " docs/post-install.md and docs/khoj-setup.md" -fi diff --git a/proxy/example.olla.yaml b/proxy/example.olla.yaml deleted file mode 100644 index 96bded8..0000000 --- a/proxy/example.olla.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# olla.yaml — Olla proxy configuration -# Docs: https://thushan.github.io/olla/ -# -# Routing priority (highest first): -# 100 — ollama-arc (local Intel Arc iGPU, this machine) -# 75 — remote LAN nodes (commented — add your other machines here) -# 50 — litellm-cloud (Claude, Gemini — cloud fallback / explicit requests) - -server: - host: "0.0.0.0" - port: 40114 - request_logging: true - write_timeout: 600s # generous for large local models - -proxy: - engine: "sherpa" # swap to "olla" for circuit breakers + connection pooling - load_balancer: "least-connections" - response_timeout: 600s - connection_timeout: 30s - -discovery: - type: "static" - static: - endpoints: - - # ── Primary: local Intel Arc node (this machine) ────────────────────── - - url: "http://ollama-arc:11434" - name: "ollama-arc-local" - type: "ollama" - priority: 100 - check_interval: 15s - check_timeout: 5s - - # ── Optional: additional LAN Ollama nodes ───────────────────────────── - # Uncomment and set real IPs for any other machines running Ollama. - # Each remote machine needs OLLAMA_HOST=0.0.0.0 in its environment. - # - # - url: "http://192.168.1.50:11434" - # name: "workstation" - # type: "ollama" - # priority: 75 - # check_interval: 15s - # check_timeout: 5s - # - # - url: "http://192.168.1.51:11434" - # name: "makerspace-server" - # type: "ollama" - # priority: 60 - # check_interval: 20s - # check_timeout: 5s - - # ── Cloud gateway via LiteLLM (Claude, Gemini) ──────────────────────── - # Lower priority — used when local nodes are busy/down, or when - # a cloud model (claude-*, gemini-*) is explicitly requested. 
- - url: "http://litellm:4000" - name: "litellm-cloud" - type: "litellm" - priority: 50 - check_interval: 30s - check_timeout: 10s - - model_discovery: - enabled: true - interval: 5m # refresh available model lists every 5 minutes - -logging: - level: "info" - format: "text" diff --git a/retriever/Dockerfile b/retriever/Dockerfile new file mode 100644 index 0000000..8de3b05 --- /dev/null +++ b/retriever/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.12-slim +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY . . +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "42000"] diff --git a/retriever/indexer.py b/retriever/indexer.py new file mode 100644 index 0000000..5aef528 --- /dev/null +++ b/retriever/indexer.py @@ -0,0 +1,216 @@ +import os +import time +import re +import threading +import httpx +import numpy as np +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler +from search import store_chunks, delete_file_chunks, rebuild_fts, indexed_file_count + +VAULT_PATH = os.environ.get("VAULT_PATH", "/vault") +OLLA_URL = os.environ.get("OLLA_URL", "http://olla:40114") +EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text") +CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "512")) +CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "64")) + +_watcher_observer: Observer | None = None +_indexing = False + + +def chunk_markdown(text: str, filepath: str) -> list[dict]: + lines = text.split("\n") + chunks = [] + current_section = [] + parent_headings: list[str] = [] + chunk_index = 0 + + for line in lines: + heading_match = re.match(r"^(#{1,6})\s+(.+)", line) + if heading_match: + if current_section: + content = "\n".join(current_section).strip() + if content: + ctx = " > ".join(parent_headings) if parent_headings else "" + prefix = f"# {ctx}\n\n" if ctx else "" + chunks.append({ + "filepath": filepath, + "chunk_index": chunk_index, + "content": prefix + content, + "parent_heading": parent_headings[-1] if parent_headings else "", + }) + chunk_index += 1 + current_section = [] + level = len(heading_match.group(1)) + heading_text = heading_match.group(2).strip() + parent_headings = parent_headings[: level - 1] + [heading_text] + current_section.append(line) + else: + current_section.append(line) + + if current_section: + content = "\n".join(current_section).strip() + if content: + ctx = " > ".join(parent_headings) if parent_headings else "" + prefix = f"# {ctx}\n\n" if ctx else "" + chunks.append({ + "filepath": filepath, + "chunk_index": chunk_index, + "content": prefix + content, + "parent_heading": parent_headings[-1] if parent_headings else "", + }) + + # Sub-chunk long sections + final_chunks = [] + for c in chunks: + if len(c["content"]) > CHUNK_SIZE: + sub_chunks = sub_chunk_text(c["content"], c["filepath"], c["parent_heading"]) + final_chunks.extend(sub_chunks) + else: + final_chunks.append(c) + + # Re-index chunk indices + for i, c in enumerate(final_chunks): + c["chunk_index"] = i + + return final_chunks + + +def sub_chunk_text(text: str, filepath: str, parent_heading: str) -> list[dict]: + paragraphs = re.split(r"\n\s*\n", text) + chunks = [] + current = [] + current_len = 0 + for para in paragraphs: + para = para.strip() + if not para: + continue + if current_len + len(para) > CHUNK_SIZE and current: + chunks.append("\n\n".join(current)) + current = [] + current_len = 0 + current.append(para) + current_len += len(para) + if current: + chunks.append("\n\n".join(current)) + + return [ + { + 
"filepath": filepath, + "chunk_index": i, + "content": c, + "parent_heading": parent_heading, + } + for i, c in enumerate(chunks) + ] + + +async def embed_text(text: str) -> np.ndarray | None: + try: + async with httpx.AsyncClient(timeout=30.0) as client: + resp = await client.post( + f"{OLLA_URL}/olla/ollama/api/embeddings", + json={"model": EMBED_MODEL, "prompt": text}, + ) + resp.raise_for_status() + data = resp.json() + return np.array(data["embedding"], dtype=np.float32) + except Exception: + return None + + +def embed_text_sync(text: str) -> np.ndarray | None: + try: + with httpx.Client(timeout=30.0) as client: + resp = client.post( + f"{OLLA_URL}/olla/ollama/api/embeddings", + json={"model": EMBED_MODEL, "prompt": text}, + ) + resp.raise_for_status() + data = resp.json() + return np.array(data["embedding"], dtype=np.float32) + except Exception: + return None + + +def index_file(filepath: str) -> int: + abs_path = os.path.join(VAULT_PATH, filepath) if not filepath.startswith("/") else filepath + if not os.path.isfile(abs_path): + return 0 + try: + with open(abs_path, "r", encoding="utf-8", errors="replace") as f: + text = f.read() + except Exception: + return 0 + + rel_path = os.path.relpath(abs_path, VAULT_PATH) + chunks = chunk_markdown(text, rel_path) + + for c in chunks: + emb = embed_text_sync(c["content"]) + if emb is not None: + c["embedding"] = emb.astype(np.float32).tobytes() + else: + c["embedding"] = None + + store_chunks(chunks) + return len(chunks) + + +def scan_vault(): + global _indexing + _indexing = True + total = 0 + for root, _dirs, files in os.walk(VAULT_PATH): + for fname in files: + if not fname.endswith(".md"): + continue + fpath = os.path.join(root, fname) + count = index_file(fpath) + if count: + total += count + rebuild_fts() + _indexing = False + return total + + +class VaultHandler(FileSystemEventHandler): + def on_modified(self, event): + if event.is_directory or not event.src_path.endswith(".md"): + return + delete_file_chunks(os.path.relpath(event.src_path, VAULT_PATH)) + index_file(event.src_path) + rebuild_fts() + + def on_created(self, event): + if event.is_directory or not event.src_path.endswith(".md"): + return + index_file(event.src_path) + rebuild_fts() + + def on_deleted(self, event): + if event.is_directory or not event.src_path.endswith(".md"): + return + delete_file_chunks(os.path.relpath(event.src_path, VAULT_PATH)) + + +def start_watcher(): + global _watcher_observer + if _watcher_observer: + return + _watcher_observer = Observer() + handler = VaultHandler() + _watcher_observer.schedule(handler, VAULT_PATH, recursive=True) + _watcher_observer.start() + + +def stop_watcher(): + global _watcher_observer + if _watcher_observer: + _watcher_observer.stop() + _watcher_observer.join() + _watcher_observer = None + + +def is_indexing() -> bool: + return _indexing diff --git a/retriever/main.py b/retriever/main.py new file mode 100644 index 0000000..0d90387 --- /dev/null +++ b/retriever/main.py @@ -0,0 +1,85 @@ +import os +import threading +from contextlib import asynccontextmanager +from fastapi import FastAPI +from pydantic import BaseModel + +from search import setup_db, hybrid_search, indexed_file_count, total_chunk_count, rebuild_fts +from indexer import scan_vault, start_watcher, stop_watcher, embed_text, is_indexing, VAULT_PATH + + +class SearchRequest(BaseModel): + query: str + top_k: int = 10 + + +class SearchResult(BaseModel): + filepath: str + chunk_index: int + content: str + parent_heading: str + score: float + + +class 
SearchResponse(BaseModel): + results: list[SearchResult] + + +class HealthResponse(BaseModel): + status: str + indexed_files: int + total_chunks: int + vault_watching: bool + vault_path: str + is_indexing: bool + + +@asynccontextmanager +async def lifespan(app: FastAPI): + setup_db() + if os.path.isdir(VAULT_PATH): + scan_thread = threading.Thread(target=run_initial_scan, daemon=True) + scan_thread.start() + yield + stop_watcher() + + +def run_initial_scan(): + scan_vault() + rebuild_fts() + start_watcher() + + +app = FastAPI(title="Retriever", version="0.1.0", lifespan=lifespan) + + +@app.get("/health", response_model=HealthResponse) +async def health(): + return HealthResponse( + status="ok", + indexed_files=indexed_file_count(), + total_chunks=total_chunk_count(), + vault_watching=True, + vault_path=VAULT_PATH, + is_indexing=is_indexing(), + ) + + +@app.post("/search", response_model=SearchResponse) +async def search(req: SearchRequest): + emb = await embed_text(req.query) + if emb is None: + return SearchResponse(results=[]) + results = hybrid_search(req.query, emb, top_k=req.top_k) + return SearchResponse(results=[SearchResult(**r) for r in results]) + + +@app.post("/reindex") +async def reindex(): + from search import get_db + db = get_db() + db.execute("DELETE FROM documents") + db.commit() + db.close() + threading.Thread(target=lambda: (scan_vault(), rebuild_fts()), daemon=True).start() + return {"status": "reindexing"} diff --git a/retriever/requirements.txt b/retriever/requirements.txt new file mode 100644 index 0000000..0498229 --- /dev/null +++ b/retriever/requirements.txt @@ -0,0 +1,5 @@ +fastapi>=0.115.0 +uvicorn[standard]>=0.34.0 +httpx>=0.28.0 +watchdog>=6.0.0 +numpy>=1.26.0 diff --git a/retriever/search.py b/retriever/search.py new file mode 100644 index 0000000..4fe626a --- /dev/null +++ b/retriever/search.py @@ -0,0 +1,175 @@ +import sqlite3 +import numpy as np +import os + +DB_PATH = os.environ.get("DB_PATH", "/data/retriever.db") + + +def get_db() -> sqlite3.Connection: + db = sqlite3.connect(DB_PATH) + db.row_factory = sqlite3.Row + return db + + +def setup_db(): + db = get_db() + db.execute(""" + CREATE TABLE IF NOT EXISTS documents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + filepath TEXT NOT NULL, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + parent_heading TEXT DEFAULT '', + embedding BLOB, + UNIQUE(filepath, chunk_index) + ) + """) + db.execute(""" + CREATE VIRTUAL TABLE IF NOT EXISTS fts_documents USING fts5( + filepath, content, parent_heading, + content=documents, content_rowid=id + ) + """) + db.execute("PRAGMA journal_mode=WAL") + db.commit() + db.close() + + +def rebuild_fts(): + db = get_db() + db.execute("INSERT INTO fts_documents(fts_documents) VALUES('rebuild')") + db.commit() + db.close() + + +def store_chunks(chunks: list[dict]): + db = get_db() + for c in chunks: + embedding_bytes = c.get("embedding") + if embedding_bytes is not None and isinstance(embedding_bytes, np.ndarray): + embedding_bytes = embedding_bytes.astype(np.float32).tobytes() + db.execute( + """INSERT OR REPLACE INTO documents (filepath, chunk_index, content, parent_heading, embedding) + VALUES (?, ?, ?, ?, ?)""", + (c["filepath"], c["chunk_index"], c["content"], c.get("parent_heading", ""), embedding_bytes), + ) + db.commit() + db.close() + + +def delete_file_chunks(filepath: str): + db = get_db() + db.execute("DELETE FROM documents WHERE filepath = ?", (filepath,)) + db.commit() + db.close() + + +def get_all_embeddings(): + db = get_db() + rows = db.execute( + "SELECT 
id, filepath, chunk_index, content, parent_heading, embedding FROM documents WHERE embedding IS NOT NULL"
+    ).fetchall()
+    db.close()
+    result = []
+    for r in rows:
+        emb = np.frombuffer(r["embedding"], dtype=np.float32) if r["embedding"] else None
+        result.append({
+            "id": r["id"],
+            "filepath": r["filepath"],
+            "chunk_index": r["chunk_index"],
+            "content": r["content"],
+            "parent_heading": r["parent_heading"],
+            "embedding": emb,
+        })
+    return result
+
+
+def search_keyword(query: str, limit: int = 20) -> list[dict]:
+    db = get_db()
+    # Quote each token so FTS5 treats it literally — raw queries containing
+    # characters like '-' or ':' would otherwise raise an OperationalError.
+    query_safe = " ".join('"' + word.replace('"', '""') + '"' for word in query.split())
+    try:
+        rows = db.execute(
+            """SELECT id, filepath, chunk_index, content, parent_heading, rank
+               FROM fts_documents
+               WHERE fts_documents MATCH ?
+               ORDER BY rank
+               LIMIT ?""",
+            (query_safe, limit),
+        ).fetchall()
+    except sqlite3.OperationalError:
+        rows = []
+    db.close()
+    return [
+        {
+            "id": r["id"],
+            "filepath": r["filepath"],
+            "chunk_index": r["chunk_index"],
+            "content": r["content"],
+            "parent_heading": r["parent_heading"],
+            "score": 1.0 / (1.0 + abs(r["rank"])),
+        }
+        for r in rows
+    ]
+
+
+def search_vector(query_embedding: np.ndarray, top_k: int = 20) -> list[dict]:
+    all_docs = get_all_embeddings()
+    if not all_docs:
+        return []
+    query_norm = query_embedding / (np.linalg.norm(query_embedding) + 1e-10)
+    scored = []
+    for d in all_docs:
+        if d["embedding"] is None:
+            continue
+        doc_norm = d["embedding"] / (np.linalg.norm(d["embedding"]) + 1e-10)
+        sim = float(np.dot(query_norm, doc_norm))
+        scored.append((sim, d))
+    scored.sort(key=lambda x: x[0], reverse=True)
+    return [
+        {
+            "id": d["id"],
+            "filepath": d["filepath"],
+            "chunk_index": d["chunk_index"],
+            "content": d["content"],
+            "parent_heading": d["parent_heading"],
+            "score": sim,
+        }
+        for sim, d in scored[:top_k]
+    ]
+
+
+def hybrid_search(query: str, query_embedding: np.ndarray, top_k: int = 10) -> list[dict]:
+    kw_results = search_keyword(query, limit=top_k * 2)
+    vec_results = search_vector(query_embedding, top_k=top_k * 2)
+    # Reciprocal rank fusion
+    rrf_k = 60
+    scores: dict[int, float] = {}
+    seen: dict[int, dict] = {}
+    for rank, r in enumerate(kw_results):
+        doc_id = r["id"]
+        scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (rrf_k + rank)
+        seen[doc_id] = r
+    for rank, r in enumerate(vec_results):
+        doc_id = r["id"]
+        scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (rrf_k + rank)
+        seen[doc_id] = r
+    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+    return [
+        {**seen[doc_id], "score": round(rrf_score, 4)}
+        for doc_id, rrf_score in ranked[:top_k]
+    ]
+
+
+def indexed_file_count() -> int:
+    db = get_db()
+    count = db.execute("SELECT COUNT(DISTINCT filepath) FROM documents").fetchone()[0]
+    db.close()
+    return count
+
+
+def total_chunk_count() -> int:
+    db = get_db()
+    count = db.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
+    db.close()
+    return count
diff --git a/router/Dockerfile b/router/Dockerfile
new file mode 100644
index 0000000..f5ce68b
--- /dev/null
+++ b/router/Dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY smart-model-router.py .
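+# The router listens on 40115 by default (LISTEN_PORT in smart-model-router.py),
+# which the EXPOSE below matches.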
+ +EXPOSE 40115 + +CMD ["python", "smart-model-router.py"] diff --git a/router/requirements.txt b/router/requirements.txt new file mode 100644 index 0000000..ed617b5 --- /dev/null +++ b/router/requirements.txt @@ -0,0 +1,3 @@ +fastapi>=0.115.0 +uvicorn[standard]>=0.34.0 +httpx>=0.28.0 diff --git a/router/smart-model-router.py b/router/smart-model-router.py new file mode 100644 index 0000000..a4e9cca --- /dev/null +++ b/router/smart-model-router.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +""" +Smart Model Router for Olla +Auto-routes queries to the best local model based on content analysis. +Sits between OpenCode and Olla: OpenCode -> Smart Router -> Olla -> ollama-arc + +Routing: +- Diagnostics -> qwen2.5:14b (fast, reliable for sysadmin) +- Scripting/Code -> qwen2.5-coder:14b (code generation) +- Reasoning -> deepseek-r1:14b (chain-of-thought) +- Longform/Logs -> gemma3:12b (long context, summaries) +- Heavy lifting -> gemma4:27b (complex analysis, large context) +- Tool calling -> mistral-small3.2:24b (strong function calling) +- Default -> qwen3.5:14b (improved reasoning, best all-rounder) +""" + +import os +import re +import json +import httpx +from typing import Tuple +from fastapi import FastAPI, Request, Response + +OLLA_URL = os.environ.get("OLLA_URL", "http://olla:40114") +LISTEN_HOST = os.environ.get("LISTEN_HOST", "0.0.0.0") +LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "40115")) + +MODELS = { + "diagnostics": "qwen2.5:14b", + "scripting": "qwen2.5-coder:14b", + "reasoning": "deepseek-r1:14b", + "longform": "gemma3:12b", + "heavy": "gemma4:27b", + "tools": "mistral-small3.2:24b", + "default": "qwen3.5:14b", +} + +PATTERNS = { + "diagnostics": [ + r"\b(diagnos|health|status|check|monitor|alert|reachable|unreachable|uptime)\b", + r"\b(system report|get_all|list models|loaded models|vram)\b", + r"\b(is .+ running|is .+ up|is .+ down|ping)\b", + r"\b(ollama|open.?webui|pipeline|container|docker)\b", + r"\b(gpu|cpu|memory|ram|disk usage)\b", + r"\b(logs? 
file|journal|syslog|dmesg|kern)\b", + ], + "scripting": [ + r"\b(script|bash|shell|command|cron|systemd|service|config)\b", + r"\b(yaml|compose|dockerfile|ansible|terraform)\b", + r"\b(fix|debug|error|traceback|exception|failed|exit code)\b", + r"\b(install|setup|configure|deploy|update|upgrade)\b", + r"\b(python|javascript|typescript|code|function|class|import)\b", + r"\b(write a|create a|generate|implement|refactor)\b.*\b(function|class|script|module)\b", + ], + "reasoning": [ + r"\b(why|root cause|explain|analyze|compare|optimize|recommend)\b", + r"\b(should i|what would you|best approach|pros and cons|trade.?off)\b", + r"\b(performance|bottleneck|slow|latency|memory leak|high cpu)\b", + r"\b(architecture|design|strategy|best practice|decouple|refactor)\b", + r"\b(math|calculate|derive|proof|theorem|logic|reason)\b", + ], + "longform": [ + r"\b(log|logs|summarize|summary|document|report)\b", + r"\b(what does this mean|walk me through|step by step|explain this)\b", + r"\b(write a|draft a|create a document|generate a report)\b", + r"\b(review|proofread|edit|rewrite|format|structure)\b", + ], + "heavy": [ + r"\b(analyze this entire|full analysis|comprehensive review)\b", + r"\b(large context|long document|big codebase|entire project)\b", + r"\b(complex|sophisticated|architectural|system.?wide)\b", + ], +} + + +def classify(text: str) -> Tuple[str, str]: + t = text.lower() + # Score each category + best_category = "default" + best_score = 0 + for category, patterns in PATTERNS.items(): + score = 0 + for pattern in patterns: + matches = re.findall(pattern, t) + score += len(matches) + if score > best_score: + best_score = score + best_category = category + return MODELS[best_category], best_category + + +def should_route(body: bytes, path: str) -> bool: + """Only route chat completion requests for local models.""" + if not path.startswith("v1/chat/completions"): + return False + try: + data = json.loads(body) + model = data.get("model", "") + # Skip cloud models — route those directly + if any(c in model for c in ("claude", "gemini", "gpt")): + return False + return True + except (json.JSONDecodeError, KeyError): + return False + + +async def handle_request(body: bytes) -> bytes: + try: + data = json.loads(body) + except json.JSONDecodeError: + return body + + messages = data.get("messages", []) + if not messages: + return body + + user_message = "" + for m in reversed(messages): + if m.get("role") == "user": + content = m.get("content", "") + # Use text content from various formats + if isinstance(content, str): + user_message = content + elif isinstance(content, list): + for part in content: + if isinstance(part, dict) and part.get("type") == "text": + user_message = part.get("text", "") + break + break + + if user_message: + model, reason = classify(user_message) + data["model"] = model + print(f"[SmartRouter] '{user_message[:80]}...' 
-> {model} ({reason})") + return json.dumps(data).encode() + + return body + + +app = FastAPI(title="Smart Model Router") + + +@app.api_route("/health", methods=["GET"]) +async def health(): + return {"status": "ok"} + + +@app.api_route("/v1/models", methods=["GET"]) +async def list_models(): + """Return available models from Olla.""" + async with httpx.AsyncClient(timeout=httpx.Timeout(10.0)) as client: + resp = await client.get(f"{OLLA_URL}/olla/ollama/v1/models") + return Response(content=resp.content, status_code=resp.status_code, headers=dict(resp.headers)) + + +@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"]) +async def proxy(request: Request, path: str): + if path.startswith("v1/"): + body = await request.body() + + if body and should_route(body, path): + body = await handle_request(body) + + async with httpx.AsyncClient(timeout=httpx.Timeout(connect=10.0, read=300.0, write=10.0, pool=10.0)) as client: + url = f"{OLLA_URL}/olla/ollama/{path}" + headers = dict(request.headers) + headers.pop("host", None) + headers.pop("content-length", None) + + response = await client.request( + method=request.method, + url=url, + headers=headers, + content=body, + params=dict(request.query_params), + ) + + return Response( + content=response.content, + status_code=response.status_code, + headers=dict(response.headers), + ) + + return Response(status_code=404) + + +def main(): + import uvicorn + print(f"[SmartRouter] Listening on {LISTEN_HOST}:{LISTEN_PORT}") + print(f"[SmartRouter] Forwarding to Olla at {OLLA_URL}") + print(f"[SmartRouter] Models: {', '.join(MODELS.values())}") + uvicorn.run(app, host=LISTEN_HOST, port=LISTEN_PORT) + + +if __name__ == "__main__": + main() diff --git a/scripts/discover-herd.sh b/scripts/discover-herd.sh new file mode 100644 index 0000000..b7e984a --- /dev/null +++ b/scripts/discover-herd.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# discover-herd.sh — mDNS discovery of Ollama nodes on LAN +# +# Uses avahi-browse to find _ollama._tcp services, then: +# 1. Writes discovered nodes to .env as OLLAMA_REMOTE_* entries +# 2. Regenerates olla.yaml via generate-olla-config.sh +# +# Also scans common ports (11434, 11435) on the local subnet as fallback. +# +# Usage: +# ./scripts/discover-herd.sh # scan, prompt before writing +# ./scripts/discover-herd.sh --apply # scan and write without prompt +# ./scripts/discover-herd.sh --dry-run # scan and print, don't write + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" +ENV_FILE="${PROJECT_DIR}/.env" + +MODE="${1:---prompt}" +declare -a DISCOVERED=() + +# ── mDNS discovery via avahi ──────────────────────────────────────────────── +discover_mdns() { + if ! 
command -v avahi-browse &>/dev/null; then
+    return 1
+  fi
+  local services
+  services=$(avahi-browse _ollama._tcp --resolve --terminate --parsable 2>/dev/null || true)
+  if [[ -z "$services" ]]; then
+    return 1
+  fi
+  # Parsable resolved lines look like:
+  #   =;iface;proto;name;type;domain;hostname;address;port;txt
+  # Only '=' lines are resolved; hostname is field 7, address field 8, port field 9.
+  while IFS=';' read -r flag _ _ _ _ _ host address port _; do
+    [[ "$flag" == "=" ]] || continue
+    host="${host%%.*}"
+    host=$(echo "$host" | tr '.-' '_')   # sanitize for use in an env var name
+    if [[ -n "$host" && -n "$address" && -n "$port" ]]; then
+      DISCOVERED+=("${host}=http://${address}:${port}")
+    fi
+  done <<< "$services"
+}
+
+# ── Subnet scan fallback ──────────────────────────────────────────────────────
+discover_subnet() {
+  local iface
+  iface=$(ip route get 1 | awk '{print $5; exit}')
+  [[ -z "$iface" ]] && return 1
+  local subnet
+  subnet=$(ip -o -f inet addr show "$iface" | awk '{print $4}')
+  [[ -z "$subnet" ]] && return 1
+  local base="${subnet%.*}"
+  echo "  scanning ${base}.0/24 ports 11434, 11435..." >&2
+  for host in $(seq 1 254); do
+    local ip="${base}.${host}"
+    for port in 11434 11435; do
+      if timeout 1 bash -c "echo > /dev/tcp/${ip}/${port}" 2>/dev/null; then
+        local name="auto_${host}"
+        DISCOVERED+=("${name}=http://${ip}:${port}")
+      fi
+    done
+  done
+}
+
+# ── Write to .env ──────────────────────────────────────────────────────────────
+write_env() {
+  local tmpf
+  tmpf=$(mktemp)
+  # Remove existing auto-discovered entries
+  while IFS= read -r line; do
+    if [[ "$line" =~ ^OLLAMA_REMOTE_auto_ ]]; then
+      continue
+    fi
+    echo "$line" >> "$tmpf"
+  done < "$ENV_FILE"
+  # Append new discoveries
+  for entry in "${DISCOVERED[@]}"; do
+    local name="${entry%%=*}"
+    local url="${entry#*=}"
+    echo "OLLAMA_REMOTE_${name}=${url}" >> "$tmpf"
+  done
+  mv "$tmpf" "$ENV_FILE"
+  echo "→ wrote ${#DISCOVERED[@]} discovered nodes to .env"
+}
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+main() {
+  echo "→ Discovering Ollama nodes on LAN..."
+
+  if discover_mdns; then
+    echo "  mDNS: found ${#DISCOVERED[@]} node(s)"
+  else
+    echo "  mDNS: no _ollama._tcp services found (or avahi not running)"
+  fi
+
+  if [[ ${#DISCOVERED[@]} -eq 0 ]]; then
+    echo "  falling back to subnet scan..."
+    discover_subnet || true
+    echo "  subnet scan: found ${#DISCOVERED[@]} node(s)"
+  fi
+
+  if [[ ${#DISCOVERED[@]} -eq 0 ]]; then
+    echo "→ no remote Ollama nodes discovered"
+    return 0
+  fi
+
+  echo ""
+  for entry in "${DISCOVERED[@]}"; do
+    echo "  ${entry%%=*} → ${entry#*=}"
+  done
+  echo ""
+
+  if [[ "$MODE" == "--dry-run" ]]; then
+    echo "→ dry-run — not writing"
+    return 0
+  fi
+
+  if [[ "$MODE" == "--apply" ]]; then
+    write_env
+  else
+    read -rp "Write these to .env and regenerate olla.yaml? [y/N] " reply
+    if [[ "$reply" =~ ^[yY] ]]; then
+      write_env
+    else
+      echo "→ skipped"
+      return 0
+    fi
+  fi
+
+  # Regenerate olla.yaml
+  bash "${SCRIPT_DIR}/generate-olla-config.sh"
+  echo "→ done — restart Olla to pick up changes: docker compose restart olla"
+}
+
+main
diff --git a/scripts/discover-network.sh b/scripts/discover-network.sh
new file mode 100755
index 0000000..8c126b0
--- /dev/null
+++ b/scripts/discover-network.sh
@@ -0,0 +1,622 @@
+#!/usr/bin/env bash
+# discover-network.sh — discover AI services on LAN and VPN networks
+#
+# Strategy:
+#   1. Accept seed hosts from user (host:port) — these unlock VPN subnets
+#   2. Verify each seed via API probe (Ollama/Olla/LiteLLM/OpenCode)
+#   3. If Olla: harvest its endpoint list for known node names
+#   4. Scan each seed's /24 subnet for more services (works over VPN)
+#   5. Auto-detect LAN /24 subnets (skips VPN — only seeds unlock those)
+#   6. Merge, verify, deduplicate, let user pick which to add/remove
+#
+# Usage:
+#   ./scripts/discover-network.sh                     # interactive
+#   ./scripts/discover-network.sh 10.10.0.201:11434   # seed(s) as args
+#   ./scripts/discover-network.sh --apply             # add all, no prompt
+#   ./scripts/discover-network.sh --dry-run           # preview only
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+ENV_FILE="${PROJECT_DIR}/.env"
+
+MODE="${1:---prompt}"
+# If first arg is a seed (contains a colon), treat all args as seeds.
+# Use ${1:-} so 'set -u' doesn't abort when the script is run with no args.
+if [[ "${1:-}" == *:* ]]; then
+  SEEDS=("$@")
+  MODE="--prompt"
+elif [[ "${1:-}" == "--apply" || "${1:-}" == "--dry-run" ]]; then
+  SEEDS=("${@:2}")
+else
+  SEEDS=()
+fi
+
+RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
+BLUE='\033[0;34m'; BOLD='\033[1m'; RESET='\033[0m'
+info() { echo -e "${BLUE}[INFO]${RESET} $*"; }
+ok()   { echo -e "${GREEN}[OK]${RESET} $*"; }
+warn() { echo -e "${YELLOW}[WARN]${RESET} $*"; }
+err()  { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
+
+ALL_PORTS="11434,11435,40114,4000,14096"
+TARGET_PORTS=(11434 11435 40114 4000 14096)
+
+# ── Collect local IPs for same-machine filtering ──────────────────────────
+get_local_ips() {
+  hostname -I 2>/dev/null | tr ' ' '\n'
+  ip -o addr show 2>/dev/null | awk '{print $4}' | cut -d/ -f1 | grep -v ':'
+}
+
+# ── Verify a host:port and return "TYPE|details" ──────────────────────────
+verify_service() {
+  local host="$1" port="$2"
+  local base="http://${host}:${port}"
+
+  # Ollama
+  if [[ "$port" == "11434" || "$port" == "11435" ]]; then
+    local resp
+    resp=$(curl -sf --max-time 5 "${base}/api/tags" 2>/dev/null || true)
+    if echo "$resp" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+models=d.get('models',[])
+if models:
+    names=[m.get('name','') for m in models[:4]]
+    print(f'ollama|{\", \".join(names)}|{len(models)} models')
+else:
+    sys.exit(1)
+" 2>/dev/null; then
+      return
+    fi
+  fi
+
+  # Olla
+  if [[ "$port" == "40114" ]]; then
+    local resp
+    resp=$(curl -sf --max-time 5 "${base}/internal/health" 2>/dev/null || true)
+    if echo "$resp" | grep -q '"status":"ok"\|"status":"healthy"'; then
+      local ep_info
+      ep_info=$(curl -sf --max-time 5 "${base}/internal/status/endpoints" 2>/dev/null | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+eps=d.get('endpoints',[])
+# Format: name=status(model_count)
+parts=[]
+for e in eps:
+    name=e.get('name','?')
+    st=e.get('status','?')
+    mc=e.get('model_count','?')
+    parts.append(f'{name}={st}({mc})')
+print(' | '.join(parts))
+" 2>/dev/null || echo "?")
+      echo "olla||endpoints: ${ep_info}"
+      return
+    fi
+  fi
+
+  # LiteLLM
+  if [[ "$port" == "4000" ]]; then
+    local resp
+    resp=$(curl -sf --max-time 5 "${base}/health/liveness" 2>/dev/null || \
+           curl -sf --max-time 5 "${base}/health" 2>/dev/null || true)
+    if echo "$resp" | grep -q '"status":"ok"'; then
+      local mc
+      mc=$(curl -sf --max-time 5 "${base}/v1/models" 2>/dev/null | python3 -c "
+import sys,json; d=json.load(sys.stdin); print(len(d.get('data',[])))
+" 2>/dev/null || echo "?")
+      echo "litellm||${mc} models"
+      return
+    fi
+  fi
+
+  # OpenCode
+  if [[ "$port" == "14096" ]]; then
+    local resp
+    resp=$(curl -sf --max-time 5 "${base}/" 2>/dev/null || true)
+    if echo "$resp" | grep -qi 'opencode'; then
+      echo "opencode||serve"
+      return
+    fi
+  fi
+}
+
+# ── Verify and label a discovered host ────────────────────────────────────
+# Returns "host|port|type|details" or empty string
+discover_and_label() {
+  local host="$1" port="$2"
+  local verified
+  verified=$(verify_service 
"$host" "$port" || true) + if [[ -n "$verified" ]]; then + echo "${host}|${port}|${verified}" + fi +} + +# ── Scan a /24 subnet for target ports ──────────────────────────────────── +scan_24() { + local subnet="$1" + if command -v nmap &>/dev/null; then + nmap -p "$ALL_PORTS" --open -T4 -n "${subnet}" 2>/dev/null | \ + awk '/^Nmap scan report for/{host=$NF} /^[0-9]+\/tcp/{print host ":" $1}' | \ + cut -d/ -f1 + else + local base + base=$(echo "$subnet" | sed 's/\.0\/24$//;s/\.0$//') + warn "nmap not found — slow scan" + for i in $(seq 1 254); do + for port in "${TARGET_PORTS[@]}"; do + (echo >/dev/tcp/"${base}.${i}"/"${port}") 2>/dev/null && echo "${base}.${i}:${port}" & + done + wait + done 2>/dev/null + fi +} + +# ── Get /24 subnet from an IP ───────────────────────────────────────────── +ip_to_24() { + local ip="$1" + echo "$(echo "$ip" | cut -d. -f1-3).0/24" +} + +# ── Check if a subnet is a VPN interface ────────────────────────────────── +is_vpn_subnet() { + local subnet="$1" + # Check routing table for VPN interfaces + ip route show table all 2>/dev/null | grep -E "dev (wg|tun|wt|tailscale)" | awk '{print $1}' | \ + grep -q "${subnet}" || return 1 +} + +# ── Auto-detect LAN subnets (non-VPN /24s) ──────────────────────────────── +detect_lan_subnets() { + local subnets=() + local added="" + while IFS= read -r line; do + local dest dev + dest=$(echo "$line" | awk '{print $1}') + dev=$(echo "$line" | awk '{print $3}') + [[ "$dest" == "default" ]] && continue + [[ "$dest" == *:* ]] && continue + [[ "$dest" != */24 ]] && continue + [[ "$dev" == docker* ]] && continue + [[ "$dev" == br-* ]] && continue + [[ "$dev" == veth* ]] && continue + [[ "$dev" == lo ]] && continue + [[ "$dev" == lxcbr* ]] && continue + [[ "$dev" == virbr* ]] && continue + [[ "$dest" == 172.* ]] && continue + # Skip VPN interfaces + [[ "$dev" == wg* ]] && continue + [[ "$dev" == tun* ]] && continue + [[ "$dev" == wt* ]] && continue + if echo "$added" | grep -q "${dest} "; then + continue + fi + added="${added} ${dest} " + subnets+=("${dest} (${dev})") + done < <(ip route show table all 2>/dev/null | grep -v 'unreachable\|prohibit\|broadcast\|local\|fe80') + printf '%s\n' "${subnets[@]}" +} + +# ── Resolve Olla endpoint names to IPs via DNS ────────────────────────── +# Queries the Olla endpoint list API, then resolves each name via VPN DNS. 
+# Arguments: olla_host olla_port +# Returns: lines of "host|port|type|details" for resolved + verified endpoints +resolve_olla_endpoints() { + local host="$1" port="$2" + local base="http://${host}:${port}" + + local resp + resp=$(curl -sf --max-time 5 "${base}/internal/status/endpoints" 2>/dev/null || true) + [[ -z "$resp" ]] && return + + local names + names=$(echo "$resp" | python3 -c " +import sys,json +d=json.load(sys.stdin) +for e in d.get('endpoints',[]): + name=e.get('name','') + if name and name != 'litellm-cloud': + print(name) +" 2>/dev/null || true) + [[ -z "$names" ]] && return + + local dns_server="" dns_domain="" + local vpn_iface + vpn_iface=$(ip route show table all 2>/dev/null | grep -E "dev (wg|tun|wt|tailscale)" | awk '{print $3}' | head -1 || true) + if [[ -n "$vpn_iface" ]]; then + dns_server=$(resolvectl dns "$vpn_iface" 2>/dev/null | awk '{print $NF}' || true) + dns_domain=$(resolvectl domain "$vpn_iface" 2>/dev/null | awk '{print $NF}' || true) + fi + [[ -z "$dns_server" ]] && dns_server=$(grep -m1 '^nameserver' /etc/resolv.conf 2>/dev/null | awk '{print $2}' || true) + + local has_dig=false + command -v dig &>/dev/null && has_dig=true + + local results=() + while IFS= read -r name; do + [[ -z "$name" ]] && continue + local resolved="" + if [[ "$has_dig" == "true" && -n "$dns_server" ]]; then + resolved=$(dig +short "$name" @"$dns_server" 2>/dev/null | head -1 || true) + if [[ -z "$resolved" && -n "$dns_domain" ]]; then + resolved=$(dig +short "${name}.${dns_domain}" @"$dns_server" 2>/dev/null | head -1 || true) + fi + if [[ -z "$resolved" && -n "$dns_domain" ]]; then + resolved=$(dig +short "bms-${name}.${dns_domain}" @"$dns_server" 2>/dev/null | head -1 || true) + fi + fi + if [[ -n "$resolved" ]]; then + local result + result=$(discover_and_label "$resolved" "11434" || true) + if [[ -n "$result" ]]; then + results+=("$result") + ok " ${resolved}:11434 = $(echo "$result" | cut -d'|' -f3) (Olla endpoint '${name}')" + fi + fi + done <<< "$names" + printf '%s\n' "${results[@]}" +} + +# ── List OLLAMA_REMOTE_* currently in .env ─────────────────────────────── +list_existing() { + grep '^OLLAMA_REMOTE_' "$ENV_FILE" 2>/dev/null || true +} + +# ── Add a service ──────────────────────────────────────────────────────── +add_service() { + local host="$1" port="$2" svc_type="$3" + if [[ "$svc_type" != "ollama" ]]; then + echo "skipped" + return + fi + local name + name=$(echo "${host}" | tr '.-' '_') + local var="OLLAMA_REMOTE_${name}" + if grep -q "^${var}=" "$ENV_FILE" 2>/dev/null; then + echo "exists" + return + fi + echo "OLLAMA_REMOTE_${name}=http://${host}:${port}" >> "$ENV_FILE" + echo "added" +} + +# ── Interactive selection menu ─────────────────────────────────────────── +interactive_menu() { + local discovered=("$@") + local existing=() + while IFS= read -r line; do + existing+=("$line") + done < <(list_existing) + + echo "" + echo -e "${BOLD}${BLUE}Discovered AI Services${RESET}" + echo -e "${BLUE}────────────────────────────────────────────────────────────────${RESET}" + printf " %-3s %-21s %-10s %-45s\n" "#" "Host:Port" "Type" "Details" + echo " ────────────────────────────────────────────────────────────────" + local i=1 + for entry in "${discovered[@]}"; do + IFS='|' read -r host port svc_type svc_name svc_ver <<< "$entry" + local details="${svc_name}" + [[ -n "$svc_ver" ]] && details="${svc_ver}" + printf " %-3d %-21s %-10s %-45s\n" "$i" "${host}:${port}" "${svc_type}" "${details}" + i=$((i + 1)) + done + + if [[ ${#existing[@]} -gt 0 ]]; then + echo "" + 
echo -e "${YELLOW}Currently configured in .env:${RESET}" + for line in "${existing[@]}"; do + echo " ${line}" + done + fi + + echo "" + if [[ "$MODE" == "--dry-run" ]]; then + info "Dry-run — no changes" + info " Would prompt to add ${#discovered[@]} discovered and remove ${#existing[@]} configured" + return + fi + + local doing="" + while true; do + echo "" + echo " a — add discovered services by number" + echo " r — remove existing OLLAMA_REMOTE_* entries" + echo " d — done (regenerate + restart hint)" + echo "" + read -rp " Choose action [a/r/d]: " cmd args + + case "${cmd}" in + a|add) + if [[ -z "${args:-}" ]]; then + read -rp " Enter numbers (e.g. 1,3,5 or 'all'): " args + fi + local added=0 existed=0 skipped=0 + if [[ "$args" == "all" ]]; then + for entry in "${discovered[@]}"; do + IFS='|' read -r h p t _ _ <<< "$entry" + result=$(add_service "$h" "$p" "$t") + case "$result" in + added) (( added++ )) || true ;; + exists) (( existed++ )) || true ;; + skipped) (( skipped++ )) || true ;; + esac + done + else + local IFS=, + for num in $args; do + num=$(echo "$num" | xargs) + local idx=$((num - 1)) + if [[ $idx -ge 0 && $idx -lt ${#discovered[@]} ]]; then + IFS='|' read -r h p t _ _ <<< "${discovered[$idx]}" + result=$(add_service "$h" "$p" "$t") + case "$result" in + added) ok "Added ${h}:${p} (${t})"; (( added++ )) || true ;; + exists) warn "${h}:${p} already in .env"; (( existed++ )) || true ;; + skipped) warn "${h}:${p} (${t}) — not Ollama, can't add via OLLAMA_REMOTE_*"; (( skipped++ )) || true ;; + esac + fi + done + fi + info "Result: ${added} added, ${existed} exists, ${skipped} skipped" + doing="changed" + ;; + + r|remove) + if [[ -z "${args:-}" ]]; then + read -rp " Enter numbers (1-${#existing[@]} or 'all'): " args + fi + if [[ ${#existing[@]} -eq 0 ]]; then + warn "No entries to remove" + continue + fi + local removed=0 + if [[ "$args" == "all" ]]; then + for line in "${existing[@]}"; do + local var; var=$(echo "$line" | cut -d= -f1) + sed -i "/^${var}=/d" "$ENV_FILE" + ok "Removed ${var}" + (( removed++ )) || true + done + existing=() + else + # Parse and sort descending + local sorted_nums + sorted_nums=$(echo "$args" | tr ',' '\n' | sort -rn) || true + while IFS= read -r num; do + num=$(echo "$num" | xargs) + [[ -z "$num" ]] && continue + local idx=$((num - 1)) + if [[ $idx -ge 0 && $idx -lt ${#existing[@]} ]]; then + local var; var=$(echo "${existing[$idx]}" | cut -d= -f1) + sed -i "/^${var}=/d" "$ENV_FILE" + ok "Removed ${var}" + (( removed++ )) || true + fi + done <<< "$sorted_nums" + existing=() + while IFS= read -r line; do existing+=("$line"); done < <(list_existing) + fi + info "Removed ${removed} entry/entries" + doing="changed" + ;; + + d|done) + if [[ "$doing" == "changed" ]]; then + echo "" + info "Regenerating Olla config..." + bash "${SCRIPT_DIR}/generate-olla-config.sh" + echo "" + ok "Done! Restart stack: sudo systemctl restart ai-stack.service" + else + info "No changes made" + fi + break + ;; + + *) warn "Unknown action: ${cmd}. Use a, r, or d." 
;; + esac + done +} + +# ── Apply all discovered ───────────────────────────────────────────────── +apply_all() { + local discovered=("$@") + local added=0 existed=0 skipped=0 + for entry in "${discovered[@]}"; do + IFS='|' read -r h p t _ _ <<< "$entry" + result=$(add_service "$h" "$p" "$t") + case "$result" in + added) (( added++ )) || true ;; + exists) (( existed++ )) || true ;; + skipped) (( skipped++ )) || true ;; + esac + done + info "Result: ${added} added, ${existed} exists, ${skipped} skipped" + if [[ "$added" -gt 0 ]]; then + info "Regenerating Olla config..." + bash "${SCRIPT_DIR}/generate-olla-config.sh" + ok "Done! Restart stack: sudo systemctl restart ai-stack.service" + fi +} + +# ── Deduplicate discovered entries ─────────────────────────────────────── +deduplicate() { + local entries=("$@") + local seen="" + for entry in "${entries[@]}"; do + local key + key=$(echo "$entry" | cut -d'|' -f1-2) + if echo "$seen" | grep -q "${key} "; then + continue + fi + seen="${seen} ${key} " + echo "$entry" + done +} + +# ── Main ───────────────────────────────────────────────────────────────── +main() { + echo "" + info "AI Service Discovery" + echo "" + + # Collect local IPs + local local_ips=() + while IFS= read -r ip; do + ip=$(echo "$ip" | xargs) + [[ -n "$ip" ]] && local_ips+=("$ip") + done < <(get_local_ips) + + # ── Step 1: Seeds ────────────────────────────────────────────────── + if [[ ${#SEEDS[@]} -eq 0 && "$MODE" != "--apply" && "$MODE" != "--dry-run" ]]; then + echo "" + read -rp "Enter known AI host:port (space-separated, e.g. '10.10.0.201:11434'), or press Enter to auto-scan LAN: " seed_input + if [[ -n "$seed_input" ]]; then + IFS=' ' read -ra SEEDS <<< "$seed_input" + fi + fi + + declare -a all_found + + if [[ ${#SEEDS[@]} -gt 0 ]]; then + info "Processing ${#SEEDS[@]} seed(s)..." + local seeds_to_expand=() + for seed in "${SEEDS[@]}"; do + local host port + host=$(echo "$seed" | cut -d: -f1) + port=$(echo "$seed" | cut -d: -f2) + [[ -z "$port" ]] && port="11434" + info "Verifying seed ${host}:${port}..." + local result + result=$(discover_and_label "$host" "$port" || true) + if [[ -n "$result" ]]; then + all_found+=("$result") + local svc_type + svc_type=$(echo "$result" | cut -d'|' -f3) + ok " ${host}:${port} = ${svc_type}" + # Check other ports on the same host (co-located services) + for p in "${TARGET_PORTS[@]}"; do + if [[ "$p" != "$port" ]]; then + local extra + extra=$(discover_and_label "$host" "$p" || true) + if [[ -n "$extra" ]]; then + all_found+=("$extra") + local et; et=$(echo "$extra" | cut -d'|' -f3) + ok " ${host}:${p} = ${et} (co-located)" + fi + fi + done + seeds_to_expand+=("$host") + else + warn " ${host}:${port} — no response or unknown service" + fi + done + + # ── Step 2: Expand from seeds (scan their /24) ──────────────── + local expanded_networks="" + for shost in "${seeds_to_expand[@]}"; do + local subnet_24 + subnet_24=$(ip_to_24 "$shost") + if echo "$expanded_networks" | grep -q "${subnet_24} "; then + continue + fi + expanded_networks="${expanded_networks} ${subnet_24} " + info "Expanding: scanning ${subnet_24}..." 
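+      # scan_24 emits one "host:port" line per open port; each hit is
+      # verified with an API probe before it is added to the results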
+ while IFS= read -r hit; do + [[ -z "$hit" ]] && continue + local h p + h=$(echo "$hit" | cut -d: -f1) + p=$(echo "$hit" | cut -d: -f2) + # Skip local machine + local skip=false + for lip in "${local_ips[@]}"; do [[ "$h" == "$lip" ]] && skip=true && break; done + [[ "$skip" == "true" ]] && { warn " ${h}:${p} — local host, skip"; continue; } + local result + result=$(discover_and_label "$h" "$p" || true) + if [[ -n "$result" ]]; then + all_found+=("$result") + local st; st=$(echo "$result" | cut -d'|' -f3) + ok " ${h}:${p} = ${st}" + fi + done < <(scan_24 "$subnet_24" 2>/dev/null || true) + done + fi + + # ── Step 3: Resolve Olla endpoint names via DNS ──────────────────── + # If any Olla instances were found, try resolving their named endpoints + # through the VPN DNS server to discover additional hosts. + local olla_done="" olla_resolved=() + for entry in "${all_found[@]}"; do + local ehost eport etype + ehost=$(echo "$entry" | cut -d'|' -f1) + eport=$(echo "$entry" | cut -d'|' -f2) + etype=$(echo "$entry" | cut -d'|' -f3) + if [[ "$etype" == "olla" ]]; then + local key="${ehost}:${eport}" + if echo "$olla_done" | grep -q "${key} "; then + continue + fi + olla_done="${olla_done} ${key} " + info "Resolving Olla endpoints from ${ehost}:${eport}..." + while IFS= read -r olla_entry; do + [[ -n "$olla_entry" ]] && olla_resolved+=("$olla_entry") + done < <(resolve_olla_endpoints "$ehost" "$eport" || true) + fi + done + for entry in "${olla_resolved[@]}"; do + all_found+=("$entry") + done + + # ── Step 4: Auto-detect LAN ──────────────────────────────────────── + info "Auto-detecting LAN subnets..." + local lan_subnets=() + while IFS= read -r subnet; do + lan_subnets+=("$subnet") + done < <(detect_lan_subnets) + + if [[ ${#lan_subnets[@]} -gt 0 ]]; then + info "Scanning ${#lan_subnets[@]} LAN subnet(s)..." + for subnet in "${lan_subnets[@]}"; do + local sn + sn=$(echo "$subnet" | cut -d' ' -f1) + info " ${sn}..." + while IFS= read -r hit; do + [[ -z "$hit" ]] && continue + local h p + h=$(echo "$hit" | cut -d: -f1) + p=$(echo "$hit" | cut -d: -f2) + local skip=false + for lip in "${local_ips[@]}"; do [[ "$h" == "$lip" ]] && skip=true && break; done + [[ "$skip" == "true" ]] && continue + local result + result=$(discover_and_label "$h" "$p" || true) + if [[ -n "$result" ]]; then + all_found+=("$result") + fi + done < <(scan_24 "$sn" 2>/dev/null || true) + done + fi + + # ── Step 5: Deduplicate ─────────────────────────────────────────── + declare -a final=() + while IFS= read -r entry; do + final+=("$entry") + done < <(deduplicate "${all_found[@]}") + + if [[ ${#final[@]} -eq 0 ]]; then + warn "No AI services found" + exit 0 + fi + + echo "" + info "Found ${#final[@]} unique service(s)" + + # ── Step 6: Present and act ─────────────────────────────────────── + if [[ "$MODE" == "--apply" ]]; then + apply_all "${final[@]}" + else + interactive_menu "${final[@]}" + fi +} + +main "$@" diff --git a/scripts/generate-keys.sh b/scripts/generate-keys.sh index 974aa85..a7ccab2 100755 --- a/scripts/generate-keys.sh +++ b/scripts/generate-keys.sh @@ -2,30 +2,16 @@ # generate-keys.sh # # Generates cryptographically secure keys for ai-stack services. -# Output can be copy-pasted into .env or used with VaultWarden. +# Output can be copy-pasted into .env. 
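+#
+# Example run (the generated value below is illustrative):
+#   $ ./scripts/generate-keys.sh --litellm
+#   LITELLM_MASTER_KEY (API key - 32 chars):
+#   sk-local-9f2c1a7e4b8d3f605a1c2e3d4b5a6f70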
# # Usage: # ./scripts/generate-keys.sh # print all keys -# ./scripts/generate-keys.sh --webui # print only WEBUI_SECRET_KEY -# ./scripts/generate-keys.sh --khoj # print only KHOJ_DJANGO_SECRET_KEY -# ./scripts/generate-keys.sh --litellm # print only LITELLM_MASTER_KEY +# ./scripts/generate-keys.sh --litellm # print only LITELLM_MASTER_KEY set -euo pipefail GREEN='\033[0;32m'; BLUE='\033[0;34m'; RESET='\033[0m' -gen_webui() { - echo -e "${BLUE}WEBUI_SECRET_KEY (Fernet - 32 url-safe base64 bytes):${RESET}" - python3 -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" - echo "" -} - -gen_khoj() { - echo -e "${BLUE}KHOJ_DJANGO_SECRET_KEY (Django secret - 50 chars):${RESET}" - python3 -c "import secrets; print(secrets.token_urlsafe(37))" - echo "" -} - gen_litellm() { echo -e "${BLUE}LITELLM_MASTER_KEY (API key - 32 chars):${RESET}" python3 -c "import secrets; print('sk-local-' + secrets.token_hex(16))" @@ -33,20 +19,11 @@ gen_litellm() { } gen_all() { - echo -e "${GREEN}═══ Generated Keys (copy to .env or VaultWarden) ═══${RESET}\n" - gen_webui - gen_khoj + echo -e "${GREEN}═══ Generated Keys (copy to .env) ═══${RESET}\n" gen_litellm - echo -e "${BLUE}PIPELINES_API_KEY / OPEN_TERMINAL_API_KEY (32 chars):${RESET}" - python3 -c "import secrets; print(secrets.token_hex(16))" - echo "" } -if [[ "${1:-}" == "--webui" ]]; then - gen_webui -elif [[ "${1:-}" == "--khoj" ]]; then - gen_khoj -elif [[ "${1:-}" == "--litellm" ]]; then +if [[ "${1:-}" == "--litellm" ]]; then gen_litellm else gen_all diff --git a/scripts/generate-olla-config.sh b/scripts/generate-olla-config.sh index f1d0f2e..261af15 100755 --- a/scripts/generate-olla-config.sh +++ b/scripts/generate-olla-config.sh @@ -52,8 +52,8 @@ OLLA_LOAD_BALANCER="${OLLA_LOAD_BALANCER:-least-connections}" OLLA_REQUEST_LOGGING="${OLLA_REQUEST_LOGGING:-true}" # ── Collect OLLAMA_REMOTE_* entries ──────────────────────────────────────────── -declare -A REMOTE_URLS -declare -A REMOTE_PRIORITIES +declare -A REMOTE_URLS=() +declare -A REMOTE_PRIORITIES=() if [[ -f "$ENV_FILE" ]]; then while IFS='=' read -r key val; do @@ -61,11 +61,18 @@ if [[ -f "$ENV_FILE" ]]; then name="${BASH_REMATCH[1]}" val="${val%%#*}"; val="${val#"${val%%[![:space:]]*}"}"; val="${val%"${val##*[![:space:]]}"}" [[ -n "$val" ]] || continue - # Check for inline priority suffix: url:port:N - if [[ "$val" =~ ^(.*):([0-9]+)$ ]]; then + # Parse format: url:port[:priority] + if [[ "$val" =~ ^(.*:[0-9]+):([0-9]+)$ ]]; then + # Three-part: url:port:priority REMOTE_URLS["$name"]="${BASH_REMATCH[1]}" REMOTE_PRIORITIES["$name"]="${BASH_REMATCH[2]}" + elif [[ "$val" =~ ^.*:[0-9]+$ ]]; then + # Two-part: url:port (no explicit priority) + REMOTE_URLS["$name"]="$val" + priority_var="OLLAMA_REMOTE_${name}_PRIORITY" + REMOTE_PRIORITIES["$name"]="${!priority_var:-70}" else + # Bare URL: only host, no port REMOTE_URLS["$name"]="$val" priority_var="OLLAMA_REMOTE_${name}_PRIORITY" REMOTE_PRIORITIES["$name"]="${!priority_var:-70}" diff --git a/scripts/resolve-vaultwarden.sh b/scripts/resolve-vaultwarden.sh index 49218ef..f2745f3 100755 --- a/scripts/resolve-vaultwarden.sh +++ b/scripts/resolve-vaultwarden.sh @@ -1,18 +1,231 @@ #!/usr/bin/env bash -# resolve-vaultwarden.sh -# Disabled: VaultWarden integration not currently configured -# To enable: set BW_CLIENT_ID, BW_CLIENT_SECRET, VAULT_MASTER_PASSWORD and uncomment below - -set -o pipefail +# resolve-vaultwarden.sh — resolve placeholders in .env +# +# Reads .env, finds placeholders, fetches +# the actual values from 
Bitwarden via the `bw` CLI, and writes them back.
+#
+# Placeholder formats:
+#   <vw:org-id/item-name>  — search item by name within an org
+#   <vw:item-id>           — fetch item by its UUID directly
+#
+# Authentication (in order of precedence):
+#   1. BW_CLIENT_ID + BW_CLIENT_SECRET + VAULT_MASTER_PASSWORD (API key)
+#   2. Existing `bw` session (already logged in and unlocked)
+#
+# Usage:
+#   ./scripts/resolve-vaultwarden.sh              # resolve in-place
+#   ./scripts/resolve-vaultwarden.sh --dry-run    # show what would change
+#
+# Env vars for auth:
+#   BW_SERVER_URL=             (optional, self-hosted VaultWarden)
+#   BW_CLIENT_ID=user.xxxxxx
+#   BW_CLIENT_SECRET=...
+#   VAULT_MASTER_PASSWORD=...  (used to unlock locked vault)
+#
+# NOTE: bw login --apikey is incompatible with self-hosted VaultWarden
+# (the server doesn't return userDecryptionOptions). If API key login fails,
+# the script falls through to the existing session. For fresh setups, run
+# 'bw login' interactively first, or set BW_CLIENT_ID + BW_CLIENT_SECRET
+# and the script will attempt --apikey (may fail on VaultWarden).
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+ENV_FILE="${SCRIPT_DIR}/.env"
+DRY_RUN=false
+RESOLVED=0
+FAILED=0
+
+if [[ "${1:-}" == "--dry-run" ]]; then
+  DRY_RUN=true
+fi
+
+if [[ ! -f "$ENV_FILE" ]]; then
+  echo "→ .env not found — nothing to resolve"
+  exit 0
+fi
+
+# Check for placeholders
+if ! grep -q '<vw:' "$ENV_FILE" 2>/dev/null; then
+  echo "→ No placeholders found in .env"
+  exit 0
+fi
+
+echo "→ Resolving placeholders in .env..."
+
+# ── Check bw CLI ──────────────────────────────────────────────────────────
+if ! command -v bw &>/dev/null; then
+  echo "✗ Bitwarden CLI (bw) not found."
+  echo "  Install: npm install -g @bitwarden/cli"
+  echo "  Or: https://bitwarden.com/help/cli/"
+  exit 1
+fi
+
+# ── Configure server URL (self-hosted VaultWarden) ────────────────────────
+if [[ -n "${BW_SERVER_URL:-}" ]]; then
+  current_server=$(bw config server 2>/dev/null || echo "")
+  if [[ "$current_server" != "$BW_SERVER_URL" ]]; then
+    echo "  Configuring server: ${BW_SERVER_URL}"
+    bw config server "$BW_SERVER_URL" >/dev/null 2>&1
+  fi
+fi
+
+# ── Authenticate ──────────────────────────────────────────────────────────
+bw_login() {
+  if [[ -n "${BW_CLIENT_ID:-}" && -n "${BW_CLIENT_SECRET:-}" ]]; then
+    echo "  Authenticating with API key..."
+    # The bw CLI reads BW_CLIENTID / BW_CLIENTSECRET (no underscores) for
+    # --apikey; map our variable names onto those.
+    export BW_CLIENTID="${BW_CLIENT_ID}" BW_CLIENTSECRET="${BW_CLIENT_SECRET}"
+    # API-key login authenticates but leaves the vault locked — a session
+    # key only comes from 'bw unlock --raw' with the master password.
+    bw login --apikey >/dev/null 2>&1 || true
+    export BW_SESSION
+    BW_SESSION=$(echo "${VAULT_MASTER_PASSWORD:-}" | bw unlock --raw 2>/dev/null || true)
+    if [[ -z "$BW_SESSION" ]]; then
+      echo "✗ bw API key login failed. Check BW_CLIENT_ID, BW_CLIENT_SECRET and VAULT_MASTER_PASSWORD."
+      return 1
+    fi
+  elif bw status 2>/dev/null | grep -q '"status":"unlocked"'; then
+    return 0
+  elif bw status 2>/dev/null | grep -q '"status":"locked"'; then
+    if [[ -n "${VAULT_MASTER_PASSWORD:-}" ]]; then
+      export BW_SESSION
+      BW_SESSION=$(echo "$VAULT_MASTER_PASSWORD" | bw unlock --raw 2>/dev/null || true)
+      if [[ -z "$BW_SESSION" ]]; then
+        echo "✗ Failed to unlock vault with VAULT_MASTER_PASSWORD."
+        return 1
+      fi
+    else
+      echo "✗ Vault is locked. Set VAULT_MASTER_PASSWORD or run 'bw unlock' manually."
+      return 1
+    fi
+  else
+    echo "✗ Not logged in to Bitwarden."
+    echo "  Set BW_CLIENT_ID and BW_CLIENT_SECRET, or run 'bw login' manually."
+    return 1
+  fi
+}
+
+if ! bw_login; then
+  exit 1
+fi
+
+BW_STATUS=$(bw status 2>/dev/null)
+if ! echo "$BW_STATUS" | grep -q '"status":"unlocked"'; then
+  echo "✗ Cannot unlock Bitwarden vault."
+  exit 1
+fi
+
+echo "  Bitwarden vault unlocked."
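+
+# For orientation, a placeholder line in .env looks like this (org and item
+# names are illustrative, using the <vw:org/item> form documented above):
+#   LITELLM_MASTER_KEY=<vw:homelab-org/litellm-master-key>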
+
+# ── Fetch item value ──────────────────────────────────────────────────────
+# Usage: fetch_value <org-id> <item-name-or-uuid>
+# Returns the password / notes / first custom field value.
+fetch_value() {
+  local org="$1"
+  local identifier="$2"
+  local item_json
+
+  # If identifier looks like a UUID (hex with dashes), try direct lookup first
+  if [[ "$identifier" =~ ^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$ ]]; then
+    item_json=$(bw get item "$identifier" 2>/dev/null || true)
+  fi
+
+  # Fall back to search
+  if [[ -z "${item_json:-}" ]]; then
+    if [[ -n "$org" ]]; then
+      item_json=$(bw list items --search "$identifier" --organizationid "$org" 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+if isinstance(data, list):
+    # Find best match: exact name match first, then partial
+    term = '$identifier'.lower()
+    for item in data:
+        name = item.get('name', '').lower()
+        if name == term:
+            print(json.dumps(item))
+            sys.exit(0)
+    # No exact match — take first result
+    if data:
+        print(json.dumps(data[0]))
+" 2>/dev/null || true)
+    else
+      item_json=$(bw list items --search "$identifier" 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+if isinstance(data, list) and data:
+    print(json.dumps(data[0]))
+" 2>/dev/null || true)
+    fi
+  fi
+
+  if [[ -z "$item_json" ]]; then
+    return 1
+  fi
+
+  # Extract value from the item — pipe JSON via stdin to avoid quoting issues
+  echo "$item_json" | python3 -c "
+import sys, json
+item = json.load(sys.stdin)
+# Priority: login.password > custom field named 'value' or 'secret' > notes
+if 'login' in item and isinstance(item['login'], dict) and 'password' in item['login']:
+    print(item['login']['password'], end='')
+elif 'fields' in item and isinstance(item['fields'], list):
+    for f in item['fields']:
+        name = (f.get('name') or '').lower()
+        if name in ('value', 'secret', 'key', 'token', 'apikey'):
+            print(f.get('value', ''), end='')
+            sys.exit(0)
+    if item['fields']:
+        print(item['fields'][0].get('value', ''), end='')
+elif 'notes' in item and item.get('notes'):
+    print(item['notes'], end='')
+else:
+    sys.exit(1)
+" 2>/dev/null || return 1
+}
+
+# ── Resolve placeholders ──────────────────────────────────────────────────
+TMP_ENV=$(mktemp)
+trap 'rm -f "$TMP_ENV"' EXIT
+
+# Placeholder pattern kept in a variable so [[ =~ ]] treats it as plain ERE
+VW_RE='<vw:([^>]+)>'
+
+while IFS= read -r line || [[ -n "$line" ]]; do
+  # Skip comment lines
+  [[ "$line" =~ ^[[:space:]]*# ]] && { echo "$line" >> "$TMP_ENV"; continue; }
+  if [[ "$line" =~ $VW_RE ]]; then
+    placeholder="${BASH_REMATCH[0]}"
+    path="${BASH_REMATCH[1]}"
+
+    # Parse: org/item-name or just item-id
+    if [[ "$path" == */* ]]; then
+      org="${path%%/*}"
+      identifier="${path#*/}"
+    else
+      org=""
+      identifier="$path"
+    fi
+
+    echo "  Resolving: ${placeholder}"
+    value=$(fetch_value "$org" "$identifier" || true)
+
+    if [[ -n "$value" ]]; then
+      line="${line//${placeholder}/${value}}"
+      echo "    ✓ resolved"
+      (( RESOLVED++ )) || true
+    else
+      echo "    ✗ could not fetch — leaving placeholder"
+      (( FAILED++ )) || true
+    fi
+  fi
+  echo "$line" >> "$TMP_ENV"
+done < "$ENV_FILE"
-echo "→ VaultWarden resolution disabled (no BW_ credentials set)"
-echo "→ Using .env as-is"
+if [[ "$DRY_RUN" == "true" ]]; then
+  echo "→ Dry-run: ${RESOLVED} placeholder(s) would be resolved"
+  if command -v diff &>/dev/null; then
+    diff --color=always "$ENV_FILE" "$TMP_ENV" 2>/dev/null || true
+  fi
+else
+  cp "$TMP_ENV" "$ENV_FILE"
+  echo "→ Resolved ${RESOLVED} placeholder(s) in .env"
+fi
-# Uncomment to enable:
-# if grep -q '<vw:' "$ENV_FILE" 2>/dev/null; then
-#   echo "→ VaultWarden placeholders found - run 
manually:" -# echo " ./scripts/resolve-vaultwarden.sh --in-place" -# fi +if [[ "$FAILED" -gt 0 ]]; then + echo "→ ${FAILED} placeholder(s) could not be resolved." + echo " Create the items in Bitwarden and re-run this script." + exit 1 +fi diff --git a/start.sh b/start.sh index 0e9d4f3..3629a91 100755 --- a/start.sh +++ b/start.sh @@ -31,9 +31,12 @@ if [[ ! -f .env ]]; then exit 1 fi -# ── 2. Generate olla.yaml from .env ────────────────────────────────────────── +# ── 2. Resolve VaultWarden placeholders (if any) ────────────────────── +bash "${SCRIPT_DIR}/scripts/resolve-vaultwarden.sh" + +# ── 3. Generate olla.yaml from .env ─────────────────────────────────── bash "${SCRIPT_DIR}/scripts/generate-olla-config.sh" -# ── 3. Start the stack ──────────────────────────────────────────────── +# ── 4. Start the stack ──────────────────────────────────────────────── echo "→ Starting stack..." docker compose up "$@" diff --git a/systemd/ai-stack.service b/systemd/ai-stack.service index b4fd47d..e82a019 100644 --- a/systemd/ai-stack.service +++ b/systemd/ai-stack.service @@ -1,5 +1,5 @@ [Unit] -Description=AI Stack (Ollama Intel Arc + Open WebUI + Pipelines + Open Terminal) +Description=AI Stack (Ollama Intel Arc + LiteLLM + Olla + Router + Retriever) After=docker.service Requires=docker.service diff --git a/tests/test_smart_model_router.py b/tests/test_smart_model_router.py deleted file mode 100644 index af97912..0000000 --- a/tests/test_smart_model_router.py +++ /dev/null @@ -1,422 +0,0 @@ -""" -Tests for pipelines/smart_model_router.py - -Covers: -- Pipeline initialisation and Valves defaults -- _classify: every pattern category (diagnostics, scripting, reasoning, longform, default) -- _classify: case-insensitivity -- inlet: normal routing, empty body, no user message, mixed role messages -- inlet: debug-mode system-message injection (prepend vs insert) -- inlet: custom Valve overrides -""" - -import sys -import os -import pytest - -# Allow importing the pipeline module without Open-WebUI installed -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "pipelines")) -from smart_model_router import Pipeline # noqa: E402 - - -# ─── Fixtures ──────────────────────────────────────────────────────────────── - -@pytest.fixture() -def pipeline(): - return Pipeline() - - -# ─── Initialisation ────────────────────────────────────────────────────────── - -class TestPipelineInit: - def test_name(self, pipeline): - assert pipeline.name == "Smart Model Router" - - def test_type(self, pipeline): - assert pipeline.type == "filter" - - def test_id(self, pipeline): - assert pipeline.id == "smart_model_router" - - def test_default_diagnostics_model(self, pipeline): - assert pipeline.valves.diagnostics_model == "qwen2.5:14b" - - def test_default_scripting_model(self, pipeline): - assert pipeline.valves.scripting_model == "qwen2.5-coder:14b" - - def test_default_reasoning_model(self, pipeline): - assert pipeline.valves.reasoning_model == "deepseek-r1:14b" - - def test_default_longform_model(self, pipeline): - assert pipeline.valves.longform_model == "gemma3:12b" - - def test_debug_off_by_default(self, pipeline): - assert pipeline.valves.debug is False - - -# ─── _classify ─────────────────────────────────────────────────────────────── - -class TestClassifyDiagnostics: - """Tests for queries that should route to the diagnostics model. - - The diagnostics patterns are evaluated first, so any query containing - a diagnostic keyword will route here regardless of other keywords. 
- """ - - @pytest.mark.parametrize("text", [ - # health / status / check - "check health of the system", - "what is the status of the server", - "monitor resource usage", - "send an alert when a threshold is reached", - "is the host reachable?", - "is nginx up?", - "is redis down?", - "ping the gateway", - "show uptime stats", - # service / container keywords - "ollama is not responding", - "open webui is crashing", - "the pipeline crashed", - "docker container won't start", - # hardware keywords - "check gpu load", - "how much cpu is in use?", - "show memory usage", - "disk usage report", - # API shortcuts - "get_all instances", - "list models on this host", - "loaded models in vram", - "show loaded models", - # misc - "what is the uptime?", - "is the container unreachable?", - ]) - def test_routes_to_diagnostics(self, pipeline, text): - model, reason = pipeline._classify(text) - assert model == pipeline.valves.diagnostics_model - assert reason == "diagnostics" - - -class TestClassifyScripting: - """Tests for queries that route to the scripting / code model. - - Note: The diagnostics patterns run first. Inputs that also contain - diagnostic keywords (e.g. 'docker', 'container') will match diagnostics - instead — those cases are covered in TestRoutingPriority below. - """ - - @pytest.mark.parametrize("text", [ - "write a bash script to restart nginx", - "shell command to list open ports", - "create a cron job for backups", - "configure a systemd unit file", - "write a Dockerfile", - "ansible playbook for deployment", - "terraform config for AWS", - "how do I fix this error in Python?", - "debug this traceback", - "exception thrown in the handler", - "process failed with exit code 1", - "setup the database", - "configure nginx reverse proxy", - "deploy the application", - "update the package list", - "upgrade the kernel", - "write a Python function", - "JavaScript class for authentication", - "TypeScript interface example", - "import the module", - "code review for this snippet", - "show me the yaml syntax", - ]) - def test_routes_to_scripting(self, pipeline, text): - model, reason = pipeline._classify(text) - assert model == pipeline.valves.scripting_model - assert reason == "scripting" - - -class TestClassifyReasoning: - """Tests for queries that route to the reasoning model. - - Inputs must not contain diagnostic keywords (which run earlier) - or scripting keywords. - """ - - @pytest.mark.parametrize("text", [ - "root cause of the crash", - "explain how kubernetes works", - "compare Redis and Memcached", - "optimize this query for performance", - "recommend a caching strategy", - "should I use nginx or apache?", - "what would you suggest for scaling?", - "best approach for zero-downtime deploys", - "pros and cons of microservices", - "high latency in network requests", - "architecture of a distributed system", - "design the API layer", - "strategy for data migration", - "best practice for secrets management", - "tradeoff between SQL and NoSQL", - ]) - def test_routes_to_reasoning(self, pipeline, text): - model, reason = pipeline._classify(text) - assert model == pipeline.valves.reasoning_model - assert reason == "reasoning" - - -class TestClassifyLongform: - """Tests for queries that route to the longform model. - - Inputs must not contain diagnostic, scripting, or reasoning keywords, - as those categories are evaluated first. 
- """ - - @pytest.mark.parametrize("text", [ - "show me the log", - "summarize the output", - "give me a summary", - "document this module", - "what does this mean?", - "step by step guide", - "write a blog post", - "draft a proposal", - "create a document for the team", - "generate a report for management", - ]) - def test_routes_to_longform(self, pipeline, text): - model, reason = pipeline._classify(text) - assert model == pipeline.valves.longform_model - assert reason == "longform" - - -class TestRoutingPriority: - """ - Documents cases where an earlier pattern category wins over a later one - when both keywords appear in the same query. These are intentional - observations of the priority ordering: diagnostics > scripting > - reasoning > longform. - """ - - @pytest.mark.parametrize("text,expected_reason", [ - # 'docker' (diagnostics) wins over 'install' / 'yaml' (scripting) - ("show me the yaml for docker compose", "diagnostics"), - ("install docker on Ubuntu", "diagnostics"), - # 'cpu' / 'memory' (diagnostics) wins over reasoning keywords - ("why is the CPU spiking?", "diagnostics"), - ("possible memory leak", "diagnostics"), - ("high cpu on the worker", "diagnostics"), - # 'service' (scripting) wins over 'slow' (reasoning) - ("the service is slow", "scripting"), - # 'analyze' (reasoning) wins over 'logs' (longform) - ("analyze these logs", "reasoning"), - # 'setup' (scripting) wins over 'walk me through' (longform) - ("walk me through the setup", "scripting"), - # 'error' (scripting) wins over 'explain this' (longform) - ("explain this error message", "scripting"), - # 'diagnos' stem requires exact word boundary — 'diagnose' is default - ("diagnose the problem", "default"), - ]) - def test_priority_winner(self, pipeline, text, expected_reason): - _, reason = pipeline._classify(text) - assert reason == expected_reason - - -class TestClassifyDefault: - @pytest.mark.parametrize("text", [ - "hello", - "what time is it?", - "thanks", - "ok", - "tell me something interesting", - "random question with no keywords", - ]) - def test_routes_to_default(self, pipeline, text): - model, reason = pipeline._classify(text) - assert model == pipeline.valves.diagnostics_model - assert reason == "default" - - def test_empty_string(self, pipeline): - model, reason = pipeline._classify("") - assert model == pipeline.valves.diagnostics_model - assert reason == "default" - - -class TestClassifyCaseInsensitivity: - def test_uppercase_diagnostic(self, pipeline): - model, reason = pipeline._classify("CHECK HEALTH") - assert reason == "diagnostics" - - def test_mixed_case_scripting(self, pipeline): - model, reason = pipeline._classify("Write A BASH Script") - assert reason == "scripting" - - def test_uppercase_reasoning(self, pipeline): - model, reason = pipeline._classify("WHY IS IT SLOW?") - assert reason == "reasoning" - - def test_uppercase_longform(self, pipeline): - model, reason = pipeline._classify("SUMMARIZE THE LOGS") - assert reason == "longform" - - -class TestClassifyCustomValves: - def test_custom_diagnostics_model_returned(self): - p = Pipeline() - p.valves.diagnostics_model = "custom-diag:7b" - model, _ = p._classify("check health") - assert model == "custom-diag:7b" - - def test_custom_scripting_model_returned(self): - p = Pipeline() - p.valves.scripting_model = "custom-code:7b" - model, _ = p._classify("write a bash script") - assert model == "custom-code:7b" - - def test_custom_reasoning_model_returned(self): - p = Pipeline() - p.valves.reasoning_model = "custom-reason:7b" - model, _ = 
p._classify("why is it slow?") - assert model == "custom-reason:7b" - - def test_custom_longform_model_returned(self): - p = Pipeline() - p.valves.longform_model = "custom-long:7b" - model, _ = p._classify("summarize the logs") - assert model == "custom-long:7b" - - -# ─── inlet ─────────────────────────────────────────────────────────────────── - -class TestInlet: - @pytest.mark.asyncio - async def test_empty_messages_returns_body_unchanged(self, pipeline): - body = {"messages": []} - result = await pipeline.inlet(body) - assert result == {"messages": []} - - @pytest.mark.asyncio - async def test_missing_messages_key_returns_body_unchanged(self, pipeline): - body = {"model": "some-model"} - result = await pipeline.inlet(body) - assert result == {"model": "some-model"} - - @pytest.mark.asyncio - async def test_no_user_message_returns_body_unchanged(self, pipeline): - body = {"messages": [{"role": "system", "content": "You are helpful."}]} - result = await pipeline.inlet(body) - # model should not be overridden because there is no user message - assert "model" not in result - - @pytest.mark.asyncio - async def test_routes_user_message_to_correct_model(self, pipeline): - body = { - "messages": [ - {"role": "user", "content": "check health of ollama"} - ] - } - result = await pipeline.inlet(body) - assert result["model"] == pipeline.valves.diagnostics_model - - @pytest.mark.asyncio - async def test_uses_last_user_message(self, pipeline): - body = { - "messages": [ - {"role": "user", "content": "summarize these logs"}, - {"role": "assistant", "content": "Here is the summary."}, - {"role": "user", "content": "write a bash script"}, - ] - } - result = await pipeline.inlet(body) - # Last user message matches scripting - assert result["model"] == pipeline.valves.scripting_model - - @pytest.mark.asyncio - async def test_skips_assistant_messages(self, pipeline): - body = { - "messages": [ - {"role": "assistant", "content": "diagnose the issue"}, - {"role": "user", "content": "why is it slow?"}, - ] - } - result = await pipeline.inlet(body) - assert result["model"] == pipeline.valves.reasoning_model - - @pytest.mark.asyncio - async def test_passes_user_kwarg(self, pipeline): - """inlet should work when user dict is supplied.""" - body = {"messages": [{"role": "user", "content": "check gpu"}]} - result = await pipeline.inlet(body, user={"id": "abc", "name": "Alice"}) - assert result["model"] == pipeline.valves.diagnostics_model - - @pytest.mark.asyncio - async def test_empty_user_content_returns_body_unchanged(self, pipeline): - body = {"messages": [{"role": "user", "content": ""}]} - result = await pipeline.inlet(body) - assert "model" not in result - - -class TestInletDebugMode: - @pytest.mark.asyncio - async def test_debug_prepends_to_existing_system_message(self): - p = Pipeline() - p.valves.debug = True - body = { - "messages": [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": "check health"}, - ] - } - result = await p.inlet(body) - system_content = result["messages"][0]["content"] - assert system_content.startswith("[Router →") - assert "You are helpful." 
in system_content - - @pytest.mark.asyncio - async def test_debug_inserts_system_message_when_none_exists(self): - p = Pipeline() - p.valves.debug = True - body = { - "messages": [ - {"role": "user", "content": "check health"}, - ] - } - result = await p.inlet(body) - assert result["messages"][0]["role"] == "system" - assert result["messages"][0]["content"].startswith("[Router →") - - @pytest.mark.asyncio - async def test_debug_includes_reason_in_system_message(self): - p = Pipeline() - p.valves.debug = True - body = {"messages": [{"role": "user", "content": "write a script"}]} - result = await p.inlet(body) - # The system message should contain the routing reason - system_content = result["messages"][0]["content"] - assert "scripting" in system_content - - @pytest.mark.asyncio - async def test_no_debug_does_not_insert_system_message(self, pipeline): - body = {"messages": [{"role": "user", "content": "check health"}]} - result = await pipeline.inlet(body) - # No system message should be injected in non-debug mode - roles = [m["role"] for m in result["messages"]] - assert "system" not in roles - - -# ─── Lifecycle hooks ───────────────────────────────────────────────────────── - -class TestLifecycle: - @pytest.mark.asyncio - async def test_on_startup(self, pipeline, capsys): - await pipeline.on_startup() - captured = capsys.readouterr() - assert "Pipeline started" in captured.out - - @pytest.mark.asyncio - async def test_on_shutdown(self, pipeline, capsys): - await pipeline.on_shutdown() - captured = capsys.readouterr() - assert "Pipeline stopped" in captured.out diff --git a/tests/test_system_diagnostics.py b/tests/test_system_diagnostics.py deleted file mode 100644 index 53162e0..0000000 --- a/tests/test_system_diagnostics.py +++ /dev/null @@ -1,483 +0,0 @@ -""" -Tests for tools/system_diagnostics.py - -Covers: -- Tools initialization -- _instance_url: valid / invalid instance -- check_health: reachable, unreachable (network error), non-200 status -- check_all_instances: all reachable, mixed, all unreachable -- list_all_models: success, network error -- list_loaded_models: success, network error -- show_model_info: success, network error, unknown instance -- free_model: success (200), failure (non-200), network error, unknown instance -- get_all: reachable with loaded models, unreachable, ps error -""" - -import sys -import os -import json -import pytest -from unittest.mock import AsyncMock, MagicMock, patch -import httpx - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "tools")) -from system_diagnostics import Tools, OLLAMA_INSTANCES # noqa: E402 - - -# ─── Helpers ───────────────────────────────────────────────────────────────── - -def make_response(status_code: int, body: dict) -> MagicMock: - """Create a mock httpx.Response.""" - r = MagicMock(spec=httpx.Response) - r.status_code = status_code - r.json.return_value = body - return r - - -# ─── Fixtures ──────────────────────────────────────────────────────────────── - -@pytest.fixture() -def tools(): - t = Tools() - t.instances = { - "local": "http://localhost:11434", - "remote1": "http://10.0.0.1:11434", - } - return t - - -# ─── Initialization ────────────────────────────────────────────────────────── - -class TestInit: - def test_instances_loaded_from_module_constant(self): - t = Tools() - assert t.instances == OLLAMA_INSTANCES - - def test_local_instance_always_present(self): - t = Tools() - assert "local" in t.instances - - -# ─── _instance_url ──────────────────────────────────────────────────────────── - 
-class TestInstanceUrl: - def test_known_instance_returns_url(self, tools): - url, err = tools._instance_url("local") - assert url == "http://localhost:11434" - assert err is None - - def test_second_known_instance(self, tools): - url, err = tools._instance_url("remote1") - assert url == "http://10.0.0.1:11434" - assert err is None - - def test_unknown_instance_returns_error(self, tools): - url, err = tools._instance_url("nonexistent") - assert url is None - assert "nonexistent" in err - assert "local" in err # lists available instances - - def test_empty_string_instance(self, tools): - url, err = tools._instance_url("") - assert url is None - assert err is not None - - -# ─── check_health ──────────────────────────────────────────────────────────── - -class TestCheckHealth: - @pytest.mark.asyncio - async def test_reachable_instance(self, tools): - resp = make_response(200, {"models": [{"name": "m1"}, {"name": "m2"}]}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_health("local")) - - assert result["status"] == "reachable" - assert result["instance"] == "local" - assert result["model_count"] == 2 - assert result["http_code"] == 200 - assert result["url"] == "http://localhost:11434" - - @pytest.mark.asyncio - async def test_unreachable_instance(self, tools): - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(side_effect=httpx.ConnectError("refused")) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_health("local")) - - assert result["status"] == "unreachable" - assert "error" in result - - @pytest.mark.asyncio - async def test_unknown_instance_returns_error_json(self, tools): - result = json.loads(await tools.check_health("ghost")) - assert "error" in result - assert "ghost" in result["error"] - - @pytest.mark.asyncio - async def test_empty_models_list(self, tools): - resp = make_response(200, {"models": []}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_health("local")) - - assert result["model_count"] == 0 - assert result["status"] == "reachable" - - @pytest.mark.asyncio - async def test_default_instance_is_local(self, tools): - """check_health() with no argument should check 'local'.""" - resp = make_response(200, {"models": []}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_health()) - - assert result["instance"] == "local" - - -# ─── check_all_instances ───────────────────────────────────────────────────── - -class TestCheckAllInstances: - @pytest.mark.asyncio - async def test_all_reachable(self, tools): - resp = make_response(200, {"models": [{"name": "x"}]}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - 
mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_all_instances()) - - assert result["local"]["status"] == "reachable" - assert result["remote1"]["status"] == "reachable" - - @pytest.mark.asyncio - async def test_all_unreachable(self, tools): - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(side_effect=httpx.ConnectError("refused")) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_all_instances()) - - for name in tools.instances: - assert result[name]["status"] == "unreachable" - assert "error" in result[name] - - @pytest.mark.asyncio - async def test_mixed_reachability(self, tools): - reachable = make_response(200, {"models": []}) - - call_count = {"n": 0} - - async def selective_get(url, **kwargs): - call_count["n"] += 1 - if "localhost" in url: - return reachable - raise httpx.ConnectError("refused") - - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = selective_get - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_all_instances()) - - assert result["local"]["status"] == "reachable" - assert result["remote1"]["status"] == "unreachable" - - @pytest.mark.asyncio - async def test_returns_url_for_all_instances(self, tools): - resp = make_response(200, {"models": []}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.check_all_instances()) - - for name, url in tools.instances.items(): - assert result[name]["url"] == url - - -# ─── list_all_models ────────────────────────────────────────────────────────── - -class TestListAllModels: - @pytest.mark.asyncio - async def test_success(self, tools): - models_body = {"models": [{"name": "llama3"}, {"name": "phi3"}]} - resp = make_response(200, models_body) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.list_all_models("local")) - - assert result["instance"] == "local" - assert result["data"] == models_body - - @pytest.mark.asyncio - async def test_network_error(self, tools): - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(side_effect=httpx.TimeoutException("timeout")) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.list_all_models("local")) - - assert "error" in result - - @pytest.mark.asyncio - async def test_unknown_instance(self, tools): - result = json.loads(await tools.list_all_models("ghost")) - assert "error" in result - - -# ─── list_loaded_models ─────────────────────────────────────────────────────── - -class TestListLoadedModels: - @pytest.mark.asyncio - async def test_success(self, 
tools): - ps_body = {"models": [{"name": "llama3", "size": 8000000000}]} - resp = make_response(200, ps_body) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.list_loaded_models("local")) - - assert result["instance"] == "local" - assert result["data"] == ps_body - - @pytest.mark.asyncio - async def test_network_error(self, tools): - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(side_effect=httpx.ConnectError("refused")) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.list_loaded_models("local")) - - assert "error" in result - - @pytest.mark.asyncio - async def test_unknown_instance(self, tools): - result = json.loads(await tools.list_loaded_models("nowhere")) - assert "error" in result - - -# ─── show_model_info ────────────────────────────────────────────────────────── - -class TestShowModelInfo: - @pytest.mark.asyncio - async def test_success(self, tools): - info_body = {"modelfile": "FROM llama3", "parameters": {}} - resp = make_response(200, info_body) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.post = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.show_model_info("local", "llama3")) - - assert result["instance"] == "local" - assert result["model"] == "llama3" - assert result["data"] == info_body - - @pytest.mark.asyncio - async def test_network_error(self, tools): - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.post = AsyncMock(side_effect=httpx.ConnectError("refused")) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.show_model_info("local", "llama3")) - - assert "error" in result - assert result["model"] == "llama3" - - @pytest.mark.asyncio - async def test_unknown_instance(self, tools): - result = json.loads(await tools.show_model_info("ghost", "llama3")) - assert "error" in result - - -# ─── free_model ─────────────────────────────────────────────────────────────── - -class TestFreeModel: - @pytest.mark.asyncio - async def test_success_200(self, tools): - resp = make_response(200, {}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.post = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.free_model("local", "llama3")) - - assert result["status"] == "unloaded" - assert result["http_code"] == 200 - - @pytest.mark.asyncio - async def test_failure_non_200(self, tools): - resp = make_response(500, {}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.post = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.free_model("local", "llama3")) - - assert 
result["status"] == "failed" - assert result["http_code"] == 500 - - @pytest.mark.asyncio - async def test_network_error(self, tools): - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.post = AsyncMock(side_effect=httpx.ConnectError("refused")) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.free_model("local", "llama3")) - - assert "error" in result - - @pytest.mark.asyncio - async def test_unknown_instance(self, tools): - result = json.loads(await tools.free_model("ghost", "llama3")) - assert "error" in result - - @pytest.mark.asyncio - async def test_sends_keep_alive_zero(self, tools): - """free_model must send keep_alive=0 to unload the model.""" - resp = make_response(200, {}) - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.post = AsyncMock(return_value=resp) - - with patch("httpx.AsyncClient", return_value=mock_client): - await tools.free_model("local", "llama3") - - call_kwargs = mock_client.post.call_args - assert call_kwargs[1]["json"]["keep_alive"] == 0 - assert call_kwargs[1]["json"]["model"] == "llama3" - - -# ─── get_all ───────────────────────────────────────────────────────────────── - -class TestGetAll: - @pytest.mark.asyncio - async def test_all_reachable_with_loaded_models(self, tools): - tags_body = {"models": [{"name": "m1"}, {"name": "m2"}]} - ps_body = {"models": [{"name": "m1"}]} - - async def get_side_effect(url, **kwargs): - if "/api/tags" in url: - return make_response(200, tags_body) - if "/api/ps" in url: - return make_response(200, ps_body) - - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = get_side_effect - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.get_all()) - - for name in tools.instances: - assert result[name]["status"] == "reachable" - assert result[name]["model_count"] == 2 - assert result[name]["loaded_models"] == [{"name": "m1"}] - - @pytest.mark.asyncio - async def test_unreachable_skips_ps_call(self, tools): - """When /api/tags fails, the entry is marked unreachable and /api/ps is not called.""" - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = AsyncMock(side_effect=httpx.ConnectError("refused")) - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.get_all()) - - for name in tools.instances: - assert result[name]["status"] == "unreachable" - assert "error" in result[name] - assert "loaded_models" not in result[name] - - @pytest.mark.asyncio - async def test_ps_error_recorded(self, tools): - """When /api/tags succeeds but /api/ps fails, loaded_models_error is recorded.""" - tags_body = {"models": []} - - async def get_side_effect(url, **kwargs): - if "/api/tags" in url: - return make_response(200, tags_body) - raise httpx.ConnectError("ps refused") - - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = get_side_effect - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.get_all()) - - for name in 
tools.instances: - assert result[name]["status"] == "reachable" - assert "loaded_models_error" in result[name] - assert "loaded_models" not in result[name] - - @pytest.mark.asyncio - async def test_includes_url_for_all_instances(self, tools): - tags_body = {"models": []} - ps_body = {"models": []} - - async def get_side_effect(url, **kwargs): - if "/api/tags" in url: - return make_response(200, tags_body) - return make_response(200, ps_body) - - mock_client = AsyncMock() - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=False) - mock_client.get = get_side_effect - - with patch("httpx.AsyncClient", return_value=mock_client): - result = json.loads(await tools.get_all()) - - for name, url in tools.instances.items(): - assert result[name]["url"] == url diff --git a/tools/system_diagnostics.py b/tools/system_diagnostics.py deleted file mode 100644 index 5a4653c..0000000 --- a/tools/system_diagnostics.py +++ /dev/null @@ -1,170 +0,0 @@ -""" -title: System Diagnostics -description: Query multiple Ollama instances for models, GPU status, health, and control. - Instances are configured via OLLAMA_INSTANCES below. - Add remote machines by adding OLLAMA_REMOTE_=http://:11434 - to your .env and re-running post-install.sh, or edit OLLAMA_INSTANCES directly. -version: 1.1.0 -""" - -import httpx -import json - -# ─── Configure your Ollama instances here ──────────────────────────────────── -# The local instance is always present. -# Add remote machines to match your OLLAMA_REMOTE_* entries in .env. -# post-install.sh will generate this block automatically if you use that workflow. -OLLAMA_INSTANCES = { - "local": "http://ollama-arc:11434", -} - - -class Tools: - def __init__(self): - self.instances = OLLAMA_INSTANCES - - def _instance_url(self, instance: str) -> tuple[str, str | None]: - """Return (url, error) for a named instance.""" - url = self.instances.get(instance) - if not url: - available = ", ".join(self.instances.keys()) - return None, f"Unknown instance '{instance}'. Available: {available}" - return url, None - - async def check_health(self, instance: str = "local") -> str: - """Ping an Ollama instance to confirm it is reachable. instance: local | remote1 | remote2 ...""" - url, err = self._instance_url(instance) - if err: - return json.dumps({"error": err}) - try: - async with httpx.AsyncClient(timeout=5) as client: - r = await client.get(f"{url}/api/tags") - return json.dumps({ - "instance": instance, - "url": url, - "status": "reachable", - "http_code": r.status_code, - "model_count": len(r.json().get("models", [])), - }, indent=2) - except Exception as e: - return json.dumps({ - "instance": instance, - "url": url, - "status": "unreachable", - "error": str(e), - }) - - async def check_all_instances(self) -> str: - """Ping every configured Ollama instance and return a health summary.""" - results = {} - async with httpx.AsyncClient(timeout=5) as client: - for name, url in self.instances.items(): - try: - r = await client.get(f"{url}/api/tags") - results[name] = { - "status": "reachable", - "url": url, - "http_code": r.status_code, - "model_count": len(r.json().get("models", [])), - } - except Exception as e: - results[name] = { - "status": "unreachable", - "url": url, - "error": str(e), - } - return json.dumps(results, indent=2) - - async def list_all_models(self, instance: str = "local") -> str: - """List all available models with sizes and quantization. 
instance: local | remote1""" - url, err = self._instance_url(instance) - if err: - return json.dumps({"error": err}) - try: - async with httpx.AsyncClient(timeout=10) as client: - r = await client.get(f"{url}/api/tags") - return json.dumps({ - "instance": instance, - "url": url, - "data": r.json() - }, indent=2) - except Exception as e: - return json.dumps({"instance": instance, "url": url, "error": str(e)}) - - async def list_loaded_models(self, instance: str = "local") -> str: - """List currently loaded models and VRAM/RAM usage. instance: local | remote1 ...""" - url, err = self._instance_url(instance) - if err: - return json.dumps({"error": err}) - try: - async with httpx.AsyncClient(timeout=10) as client: - r = await client.get(f"{url}/api/ps") - return json.dumps({ - "instance": instance, - "url": url, - "data": r.json() - }, indent=2) - except Exception as e: - return json.dumps({"instance": instance, "url": url, "error": str(e)}) - - async def show_model_info(self, instance: str, model_name: str) -> str: - """Get modelfile, parameters, and template for a specific model. instance: local | remote1 ...""" - url, err = self._instance_url(instance) - if err: - return json.dumps({"error": err}) - try: - async with httpx.AsyncClient(timeout=10) as client: - r = await client.post(f"{url}/api/show", json={"name": model_name}) - return json.dumps({ - "instance": instance, - "model": model_name, - "data": r.json() - }, indent=2) - except Exception as e: - return json.dumps({"instance": instance, "model": model_name, "error": str(e)}) - - async def free_model(self, instance: str, model_name: str) -> str: - """Unload a model from VRAM/RAM on a given instance. instance: local | remote1 ...""" - url, err = self._instance_url(instance) - if err: - return json.dumps({"error": err}) - try: - async with httpx.AsyncClient(timeout=10) as client: - r = await client.post( - f"{url}/api/generate", - json={"model": model_name, "keep_alive": 0} - ) - return json.dumps({ - "instance": instance, - "model": model_name, - "status": "unloaded" if r.status_code == 200 else "failed", - "http_code": r.status_code, - }, indent=2) - except Exception as e: - return json.dumps({"instance": instance, "model": model_name, "error": str(e)}) - - async def get_all(self) -> str: - """Full diagnostic report: health, loaded models, and model lists for all instances.""" - results = {} - async with httpx.AsyncClient(timeout=10) as client: - for name, url in self.instances.items(): - entry = {"url": url} - try: - health = await client.get(f"{url}/api/tags") - tags = health.json() - entry["status"] = "reachable" - entry["http_code"] = health.status_code - entry["model_count"] = len(tags.get("models", [])) - entry["models"] = tags.get("models", []) - except Exception as e: - entry["status"] = "unreachable" - entry["error"] = str(e) - results[name] = entry - continue - try: - ps = await client.get(f"{url}/api/ps") - entry["loaded_models"] = ps.json().get("models", []) - except Exception as e: - entry["loaded_models_error"] = str(e) - results[name] = entry - return json.dumps(results, indent=2)