19 changes: 14 additions & 5 deletions .env.example
@@ -23,14 +23,22 @@
STACK_USER=yourusername
OLLAMA_DATA=/home/${STACK_USER}/.ollama

# ─── Intel GPU device nodes ───────────────────────────────────────────────────
GPU_CARD=/dev/dri/card1
GPU_RENDER=/dev/dri/renderD128

# ── Ollama ────────────────────────────────────────────────────────────────────
OLLAMA_PORT=11434
OLLAMA_SSH_KEY=/home/${STACK_USER}/.ssh/id_ed25519_ollama
OLLAMA_SSH_KEY_PUB=/home/${STACK_USER}/.ssh/id_ed25519_ollama.pub
# Memory limits — adjust for your hardware.
# Intel Arc users: 16gb / 32g is a good starting point for a 32GB system.
# NVIDIA GPU users: match shm to your VRAM size.
# CPU-only users: 2gb / 8g (defaults set in docker-compose.yml).
OLLAMA_SHM_SIZE=2gb
OLLAMA_MEM_LIMIT=8g

# ── Intel Arc GPU device nodes (only needed with docker-compose.arc.yml) ──────
# Run scripts/check-arc-gpu.sh to find the correct device paths for your system.
# These values are ignored when using the CPU or NVIDIA compose overlay.
GPU_CARD=/dev/dri/card1
GPU_RENDER=/dev/dri/renderD128
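# Quick manual check (a sketch; node names vary with kernel and boot order):
#   ls -l /dev/dri/
# card* is the display node and renderD* the render/compute node;
# scripts/check-arc-gpu.sh automates this lookup.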

# ── Cloud API Keys ────────────────────────────────────────────────────────────
ANTHROPIC_API_KEY=sk-ant-...
@@ -39,7 +47,8 @@ OPENAI_API_KEY= # optional — leave blank if not using OpenAI

# ── LiteLLM ───────────────────────────────────────────────────────────────────
LITELLM_PORT=4000
LITELLM_MASTER_KEY=sk-local-admin-changeme
# Generate a secure key with: scripts/generate-keys.sh
LITELLM_MASTER_KEY=sk-change-this-before-deployment
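# Equivalent one-liner if you prefer not to run the script (assumes openssl is
# installed; the sk-local- prefix is just the convention used in this stack):
#   LITELLM_MASTER_KEY=sk-local-$(openssl rand -hex 24)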

# ── Olla — unified LLM router ─────────────────────────────────────────────────
OLLA_PORT=40114
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
@@ -23,6 +23,9 @@ jobs:
          docker compose version

      - name: Validate docker-compose.yml syntax
        env:
          LITELLM_MASTER_KEY: ci-validation-key
          STACK_USER: ci-user
        run: docker compose config --quiet
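      # The same validation can be reproduced locally before pushing (a sketch,
      # reusing the workflow's placeholder values):
      #   LITELLM_MASTER_KEY=ci-validation-key STACK_USER=ci-user \
      #     docker compose config --quiet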

      - name: Lint shell scripts
34 changes: 34 additions & 0 deletions docker-compose.arc.yml
@@ -0,0 +1,34 @@
# docker-compose.arc.yml — Intel Arc iGPU overlay
#
# Use with:
# docker compose -f docker-compose.yml -f docker-compose.arc.yml up -d
#
# Requires:
# - Intel Arc GPU (checked by scripts/check-arc-gpu.sh)
# - GPU_CARD and GPU_RENDER set in .env (see .env.example)
# - OLLAMA_SHM_SIZE and OLLAMA_MEM_LIMIT sized for your GPU (default 16gb/32g)
#
# See docs/hardware/arc.md for setup guide and model recommendations.
#
# Source image: https://github.com/Ava-AgentOne/ollama-intel
# Replaces intelanalytics/ipex-llm-inference-cpp-xpu (archived Jan 28, 2026)

services:
  ollama:
    image: ghcr.io/ava-agentone/ollama-intel:latest
    container_name: ollama
    environment:
      - DEVICE=Arc
      - OLLAMA_INTEL_GPU=true
      - ONEAPI_DEVICE_SELECTOR=level_zero:0
      - ZES_ENABLE_SYSMAN=1
      - OLLAMA_DEBUG=1
    devices:
      # GPU card node — may be card0 or card1 depending on boot order.
      # Run scripts/check-arc-gpu.sh to find the correct device paths.
      - ${GPU_CARD:-/dev/dri/card1}:${GPU_CARD:-/dev/dri/card1}
      - ${GPU_RENDER:-/dev/dri/renderD128}:${GPU_RENDER:-/dev/dri/renderD128}
    shm_size: '${OLLAMA_SHM_SIZE:-16gb}'
    mem_limit: '${OLLAMA_MEM_LIMIT:-32g}'
    healthcheck:
      start_period: 60s  # Arc iGPU needs extra time for driver init
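
# To confirm the overlay took effect after start-up (a sketch; exact log
# wording varies between image versions):
#   docker compose -f docker-compose.yml -f docker-compose.arc.yml config | grep -A4 'devices:'
#   docker compose logs ollama | grep -i 'level_zero\|sycl\|gpu'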
30 changes: 30 additions & 0 deletions docker-compose.nvidia.yml
@@ -0,0 +1,30 @@
# docker-compose.nvidia.yml — NVIDIA GPU overlay
#
# Use with:
# docker compose -f docker-compose.yml -f docker-compose.nvidia.yml up -d
#
# Requires:
# - NVIDIA GPU with CUDA support
# - NVIDIA Container Toolkit installed:
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
# - nvidia-smi working on the host
# - OLLAMA_SHM_SIZE and OLLAMA_MEM_LIMIT sized for your GPU (default 8gb/16g)
#
# See docs/hardware/nvidia.md for setup guide and model recommendations.

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    shm_size: '${OLLAMA_SHM_SIZE:-8gb}'
    mem_limit: '${OLLAMA_MEM_LIMIT:-16g}'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
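
# To confirm the container sees the GPU after start-up (a sketch; the NVIDIA
# Container Toolkit mounts nvidia-smi into the container when the 'utility'
# capability is requested):
#   docker exec ollama nvidia-smi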
70 changes: 39 additions & 31 deletions docker-compose.yml
@@ -1,20 +1,27 @@
###############################################################################
# AI Stack — with Intel Arc iGPU + cloud models + LiteLLM, unified via Olla router
# AI Stack — local LLMs + cloud models + LiteLLM, unified via Olla router
#
# Topology:
#
# Retriever ──[embed]──▶ Olla :40114 ──▶ ollama-arc :11434 (Intel Arc iGPU)
# Retriever ──[embed]──▶ Olla :40114 ──▶ ollama :11434 (local LLM)
# OpenCode ──[tool / provider]──▶ Olla :40114
# ──[provider]───────▶ LiteLLM :4000 ──▶ Claude (Anthropic)
# ──▶ Gemini (Google)
#
# All inference through Olla for unified routing + load balancing.
# OpenCode is the primary AI interface (CLI + Obsidian sidebar plugin).
#
# Hardware profiles (pick one):
# CPU / generic: docker compose up -d (this file)
# Intel Arc iGPU: docker compose -f docker-compose.yml -f docker-compose.arc.yml up -d
# NVIDIA GPU: docker compose -f docker-compose.yml -f docker-compose.nvidia.yml up -d
#
# See docs/hardware/ for requirements, tuning, and troubleshooting.
#
# Quick start:
# 1. cp .env.example .env && nano .env
# 2. ./check-arc-gpu.sh # validate GPU_CARD / GPU_RENDER device paths
# 3. docker compose up -d
# 2. scripts/generate-olla-config.sh # generate proxy/olla.yaml
# 3. docker compose up -d # add -f overlay for your GPU
#
# Service URLs:
# Olla status http://localhost:40114/internal/status/endpoints
@@ -24,36 +24,31 @@

services:

  # ─── Ollama (Intel Arc iGPU) ───────────────────────────────────────────────
  # Uses ava-agentone/ollama-intel: community-maintained, iGPU-first
  # Replaces intelanalytics/ipex-llm-inference-cpp-xpu (archived Jan 28, 2026)
  # Source: https://github.com/Ava-AgentOne/ollama-intel
  ollama-arc:
    image: ghcr.io/ava-agentone/ollama-intel:latest
    container_name: ollama-arc
  # ─── Ollama — local LLM inference ─────────────────────────────────────────
  # Default: standard Ollama image (CPU / CUDA).
  # For GPU acceleration, add a hardware overlay:
  #   Intel Arc: -f docker-compose.arc.yml
  #   NVIDIA:    -f docker-compose.nvidia.yml
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    restart: unless-stopped
    environment:
      - OLLAMA_HOST=0.0.0.0
      - OLLAMA_ORIGINS=app://obsidian.md*
      - OLLAMA_DEBUG=1
      - DEVICE=Arc
      - OLLAMA_INTEL_GPU=true
      - ONEAPI_DEVICE_SELECTOR=level_zero:0
      - ZES_ENABLE_SYSMAN=1
      - OLLAMA_KEEP_ALIVE=-1
    volumes:
      - ${OLLAMA_DATA:-/home/user/.ollama}:/root/.ollama
      - ${OLLAMA_SSH_KEY:-/dev/null}:/root/.ollama/id_ed25519:ro
      - ${OLLAMA_SSH_KEY_PUB:-/dev/null}:/root/.ollama/id_ed25519.pub:ro
    devices:
      # GPU card node — may be card0 or card1 depending on boot
      # check-arc-gpu.sh validates this before stack starts
      - ${GPU_CARD:-/dev/dri/card1}:${GPU_CARD:-/dev/dri/card1}
      - ${GPU_RENDER:-/dev/dri/renderD128}:${GPU_RENDER:-/dev/dri/renderD128}
    ports:
      - "${OLLAMA_PORT:-11434}:11434"
    shm_size: '16gb'
    mem_limit: 32g
    shm_size: '${OLLAMA_SHM_SIZE:-2gb}'
    mem_limit: '${OLLAMA_MEM_LIMIT:-8g}'
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:11434/"]
      interval: 15s
      timeout: 10s
      retries: 5
      start_period: 30s
    networks:
      - ai-net

@@ -73,7 +77,7 @@ services:
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
      - GEMINI_API_KEY=${GEMINI_API_KEY}
      - OPENAI_API_KEY=${OPENAI_API_KEY:-not-set}
      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-sk-local-admin}
      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-sk-changeme-set-in-env}
    command: ["--config", "/app/config.yaml", "--port", "4000"]
    healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:4000/health/liveness?key=${LITELLM_MASTER_KEY:-sk-local-admin}')"]
@@ -85,7 +89,7 @@ services:
      - ai-net

  # ─── Olla — LLM proxy & load balancer ─────────────────────────────────────
  # Single unified endpoint for all models — local Arc GPU + cloud via LiteLLM.
  # Single unified endpoint for all models — local Ollama + cloud via LiteLLM.
  # LAN node config is NOT baked in here — declare extra nodes as OLLAMA_REMOTE_*
  # variables in your .env, then run scripts/generate-olla-config.sh to produce
  # proxy/olla.yaml before bringing up the stack.
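  # Example .env entries (a sketch; the suffix after OLLAMA_REMOTE_ is an
  # arbitrary label, assumed to become the node name in the generated config):
  #   OLLAMA_REMOTE_NAS=http://192.168.1.51:11434
  #   OLLAMA_REMOTE_DESKTOP=http://192.168.1.52:11434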
@@ -100,8 +104,10 @@ services:
      - ./proxy/olla.yaml:/app/config.yaml:ro  # generated by scripts/generate-olla-config.sh — in .gitignore
      - olla-logs:/app/logs
    depends_on:
      - ollama-arc
      - litellm
      ollama:
        condition: service_healthy
      litellm:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:40114/internal/health"]
      interval: 30s
@@ -116,7 +122,7 @@ services:
  # Uses sqlite-vec for vector storage, FTS5 for keyword search,
  # and watchdog for live vault indexing.
  # Hybrid search: BM25 (FTS5) + vector similarity, fused via RRF.
  # Embeddings via Olla → ollama-arc (nomic-embed-text).
  # Embeddings via Olla → ollama (nomic-embed-text).
  # API-only: no web UI. Intended for use as an OpenCode tool.
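  # For reference, RRF scores each document as score(d) = Σ_r 1/(k + rank_r(d))
  # across the two rankers; k = 60 is the usual default (the exact constant
  # used by the retriever is an assumption).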
  retriever:
    build: ./retriever
@@ -135,7 +141,8 @@
      - DB_PATH=/data/retriever.db
      - VAULT_PATH=/vault
    depends_on:
      - olla
      olla:
        condition: service_healthy
    networks:
      - ai-net

@@ -154,7 +161,8 @@ services:
      - LISTEN_HOST=0.0.0.0
      - LISTEN_PORT=40115
    depends_on:
      - olla
      olla:
        condition: service_healthy
    networks:
      - ai-net

@@ -165,4 +173,4 @@ volumes:

# ─── Networks ─────────────────────────────────────────────────────────────────
networks:
  ai-net:
  ai-net:
10 changes: 5 additions & 5 deletions scripts/generate-olla-config.sh
@@ -16,7 +16,7 @@
# OLLAMA_REMOTE_NAS=http://192.168.1.51:11434
#
# Fixed nodes (always present, not configurable):
# priority 100 — ollama-arc (local Intel Arc iGPU)
# priority 100 — ollama (local Ollama node — Arc, NVIDIA, or CPU depending on overlay)
# priority 50 — litellm (cloud gateway: Claude, Gemini)
#
# Advanced tuning (optional .env vars):
@@ -82,7 +82,7 @@ fi

echo "→ Generating olla.yaml..."
if [[ ${#REMOTE_URLS[@]} -eq 0 ]]; then
echo " (no OLLAMA_REMOTE_* entries found — only local Arc node + LiteLLM)"
echo " (no OLLAMA_REMOTE_* entries found — only local Ollama node + LiteLLM)"
fi

# ── Write olla.yaml ────────────────────────────────────────────────────────────
@@ -108,9 +108,9 @@ discovery:
  static:
    endpoints:

      # ── Local Intel Arc iGPU node (priority 100) ─────────────────
      - url: "http://ollama-arc:11434"
        name: "ollama-arc"
      # ── Local Ollama node (priority 100) ─────────────────
      - url: "http://ollama:11434"
        name: "ollama"
        type: "ollama"
        priority: 100
        check_interval: 15s
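
      # A remote declared as OLLAMA_REMOTE_NAS=http://192.168.1.51:11434 would
      # be emitted next to this node, roughly like the sketch below (the name is
      # assumed to come from the variable suffix, and the priority value for
      # remotes is an assumption):
      #   - url: "http://192.168.1.51:11434"
      #     name: "nas"
      #     type: "ollama"
      #     priority: 75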