diff --git a/.env.example b/.env.example
index 2784c41..71e09c3 100644
--- a/.env.example
+++ b/.env.example
@@ -23,14 +23,22 @@
 STACK_USER=yourusername
 OLLAMA_DATA=/home/${STACK_USER}/.ollama
 
-# ─── Intel GPU device nodes ───────────────────────────────────────────────────
-GPU_CARD=/dev/dri/card1
-GPU_RENDER=/dev/dri/renderD128
-
 # ── Ollama ────────────────────────────────────────────────────────────────────
 OLLAMA_PORT=11434
 OLLAMA_SSH_KEY=/home/${STACK_USER}/.ssh/id_ed25519_ollama
 OLLAMA_SSH_KEY_PUB=/home/${STACK_USER}/.ssh/id_ed25519_ollama.pub
+# Memory limits — adjust for your hardware.
+# Intel Arc users: 16gb / 32g is a good starting point for a 32GB system.
+# NVIDIA GPU users: match shm to your VRAM size.
+# CPU-only users: 2gb / 8g (defaults set in docker-compose.yml).
+OLLAMA_SHM_SIZE=2gb
+OLLAMA_MEM_LIMIT=8g
+
+# ── Intel Arc GPU device nodes (only needed with docker-compose.arc.yml) ──────
+# Run scripts/check-arc-gpu.sh to find the correct device paths for your system.
+# These values are ignored when using the CPU or NVIDIA compose overlay.
+GPU_CARD=/dev/dri/card1
+GPU_RENDER=/dev/dri/renderD128
 
 # ── Cloud API Keys ──────────────────────────────────────────────────────────────
 ANTHROPIC_API_KEY=sk-ant-...
@@ -39,7 +47,8 @@ OPENAI_API_KEY=   # optional — leave blank if not using OpenAI
 
 # ── LiteLLM ─────────────────────────────────────────────────────────────────────
 LITELLM_PORT=4000
-LITELLM_MASTER_KEY=sk-local-admin-changeme
+# Generate a secure key with: scripts/generate-keys.sh
+LITELLM_MASTER_KEY=sk-change-this-before-deployment
 
 # ── Olla — unified LLM router ───────────────────────────────────────────────────
 OLLA_PORT=40114
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 61f06ec..89bf2dd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,6 +23,9 @@ jobs:
           docker compose version
 
       - name: Validate docker-compose.yml syntax
+        env:
+          LITELLM_MASTER_KEY: ci-validation-key
+          STACK_USER: ci-user
         run: docker compose config --quiet
 
       - name: Lint shell scripts
diff --git a/docker-compose.arc.yml b/docker-compose.arc.yml
new file mode 100644
index 0000000..eba8c0a
--- /dev/null
+++ b/docker-compose.arc.yml
@@ -0,0 +1,34 @@
+# docker-compose.arc.yml — Intel Arc iGPU overlay
+#
+# Use with:
+#   docker compose -f docker-compose.yml -f docker-compose.arc.yml up -d
+#
+# Requires:
+#   - Intel Arc GPU (checked by scripts/check-arc-gpu.sh)
+#   - GPU_CARD and GPU_RENDER set in .env (see .env.example)
+#   - OLLAMA_SHM_SIZE and OLLAMA_MEM_LIMIT sized for your GPU (default 16gb/32g)
+#
+# See docs/hardware/arc.md for setup guide and model recommendations.
+#
+# Source image: https://github.com/Ava-AgentOne/ollama-intel
+# Replaces intelanalytics/ipex-llm-inference-cpp-xpu (archived Jan 28, 2026)
+
+services:
+  ollama:
+    image: ghcr.io/ava-agentone/ollama-intel:latest
+    container_name: ollama
+    environment:
+      - DEVICE=Arc
+      - OLLAMA_INTEL_GPU=true
+      - ONEAPI_DEVICE_SELECTOR=level_zero:0
+      - ZES_ENABLE_SYSMAN=1
+      - OLLAMA_DEBUG=1
+    devices:
+      # GPU card node — may be card0 or card1 depending on boot order.
+      # Run scripts/check-arc-gpu.sh to find the correct device paths.
+      - ${GPU_CARD:-/dev/dri/card1}:${GPU_CARD:-/dev/dri/card1}
+      - ${GPU_RENDER:-/dev/dri/renderD128}:${GPU_RENDER:-/dev/dri/renderD128}
+    shm_size: '${OLLAMA_SHM_SIZE:-16gb}'
+    mem_limit: '${OLLAMA_MEM_LIMIT:-32g}'
+    healthcheck:
+      start_period: 60s   # Arc iGPU needs extra time for driver init
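Note on the GPU_CARD / GPU_RENDER defaults above: the card index is not stable across boots, so the values in .env should come from the host rather than from the example. A rough manual check, assuming a typical Linux /dev/dri layout (scripts/check-arc-gpu.sh automates this):

    ls -l /dev/dri/            # card0/card1 plus renderD128/renderD129 nodes
    ls -l /dev/dri/by-path/    # maps each PCI device to its card*/renderD* pair
    # pick the pair belonging to the Arc GPU, then set in .env:
    #   GPU_CARD=/dev/dri/card1
    #   GPU_RENDER=/dev/dri/renderD128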
diff --git a/docker-compose.nvidia.yml b/docker-compose.nvidia.yml
new file mode 100644
index 0000000..6f6579d
--- /dev/null
+++ b/docker-compose.nvidia.yml
@@ -0,0 +1,30 @@
+# docker-compose.nvidia.yml — NVIDIA GPU overlay
+#
+# Use with:
+#   docker compose -f docker-compose.yml -f docker-compose.nvidia.yml up -d
+#
+# Requires:
+#   - NVIDIA GPU with CUDA support
+#   - NVIDIA Container Toolkit installed:
+#     https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
+#   - nvidia-smi working on the host
+#   - OLLAMA_SHM_SIZE and OLLAMA_MEM_LIMIT sized for your GPU (default 8gb/16g)
+#
+# See docs/hardware/nvidia.md for setup guide and model recommendations.
+
+services:
+  ollama:
+    image: ollama/ollama:latest
+    container_name: ollama
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    shm_size: '${OLLAMA_SHM_SIZE:-8gb}'
+    mem_limit: '${OLLAMA_MEM_LIMIT:-16g}'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
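Either overlay can be dry-run before starting anything: "docker compose config" renders the merged file, which makes it easy to confirm that the overlay's image, devices, and memory limits actually override the base service. A minimal sketch using only standard Docker/Compose commands (nothing repo-specific):

    docker compose -f docker-compose.yml -f docker-compose.nvidia.yml config > /tmp/merged.yml
    grep -n 'image:\|shm_size\|nvidia' /tmp/merged.yml
    # NVIDIA only: confirm the Container Toolkit is wired up at all
    docker run --rm --gpus all ubuntu nvidia-smi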
diff --git a/docker-compose.yml b/docker-compose.yml
index 896e738..1203775 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,9 +1,9 @@
 ###############################################################################
-# AI Stack — with Intel Arc iGPU + cloud models + LiteLLM, unified via Olla router
+# AI Stack — local LLMs + cloud models + LiteLLM, unified via Olla router
 #
 # Topology:
 #
-#   Retriever ──[embed]──▶ Olla :40114 ──▶ ollama-arc :11434 (Intel Arc iGPU)
+#   Retriever ──[embed]──▶ Olla :40114 ──▶ ollama :11434 (local LLM)
 #   OpenCode  ──[tool / provider]──▶ Olla :40114
 #             ──[provider]───────▶ LiteLLM :4000 ──▶ Claude (Anthropic)
 #                                                ──▶ Gemini (Google)
@@ -11,10 +11,17 @@
 # All inference through Olla for unified routing + load balancing.
 # OpenCode is the primary AI interface (CLI + Obsidian sidebar plugin).
 #
+# Hardware profiles (pick one):
+#   CPU / generic:   docker compose up -d   (this file)
+#   Intel Arc iGPU:  docker compose -f docker-compose.yml -f docker-compose.arc.yml up -d
+#   NVIDIA GPU:      docker compose -f docker-compose.yml -f docker-compose.nvidia.yml up -d
+#
+# See docs/hardware/ for requirements, tuning, and troubleshooting.
+#
 # Quick start:
 #   1. cp .env.example .env && nano .env
-#   2. ./check-arc-gpu.sh              # validate GPU_CARD / GPU_RENDER device paths
-#   3. docker compose up -d
+#   2. scripts/generate-olla-config.sh # generate proxy/olla.yaml
+#   3. docker compose up -d            # add -f overlay for your GPU
 #
 # Service URLs:
 #   Olla status   http://localhost:40114/internal/status/endpoints
@@ -24,36 +31,33 @@
 services:
 
-  # ─── Ollama (Intel Arc iGPU) ───────────────────────────────────────────────
-  # Uses ava-agentone/ollama-intel: community-maintained, iGPU-first
-  # Replaces intelanalytics/ipex-llm-inference-cpp-xpu (archived Jan 28, 2026)
-  # Source: https://github.com/Ava-AgentOne/ollama-intel
-  ollama-arc:
-    image: ghcr.io/ava-agentone/ollama-intel:latest
-    container_name: ollama-arc
+  # ─── Ollama — local LLM inference ─────────────────────────────────────────
+  # Default: standard Ollama image (CPU / CUDA).
+  # For GPU acceleration, add a hardware overlay:
+  #   Intel Arc:  -f docker-compose.arc.yml
+  #   NVIDIA:     -f docker-compose.nvidia.yml
+  ollama:
+    image: ollama/ollama:latest
+    container_name: ollama
     restart: unless-stopped
     environment:
       - OLLAMA_HOST=0.0.0.0
       - OLLAMA_ORIGINS=app://obsidian.md*
-      - OLLAMA_DEBUG=1
-      - DEVICE=Arc
-      - OLLAMA_INTEL_GPU=true
-      - ONEAPI_DEVICE_SELECTOR=level_zero:0
-      - ZES_ENABLE_SYSMAN=1
       - OLLAMA_KEEP_ALIVE=-1
     volumes:
       - ${OLLAMA_DATA:-/home/user/.ollama}:/root/.ollama
       - ${OLLAMA_SSH_KEY:-/dev/null}:/root/.ollama/id_ed25519:ro
       - ${OLLAMA_SSH_KEY_PUB:-/dev/null}:/root/.ollama/id_ed25519.pub:ro
-    devices:
-      # GPU card node — may be card0 or card1 depending on boot
-      # check-arc-gpu.sh validates this before stack starts
-      - ${GPU_CARD:-/dev/dri/card1}:${GPU_CARD:-/dev/dri/card1}
-      - ${GPU_RENDER:-/dev/dri/renderD128}:${GPU_RENDER:-/dev/dri/renderD128}
     ports:
       - "${OLLAMA_PORT:-11434}:11434"
-    shm_size: '16gb'
-    mem_limit: 32g
+    shm_size: '${OLLAMA_SHM_SIZE:-2gb}'
+    mem_limit: '${OLLAMA_MEM_LIMIT:-8g}'
+    healthcheck:
+      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:11434/"]
+      interval: 15s
+      timeout: 10s
+      retries: 5
+      start_period: 30s
     networks:
       - ai-net
 
@@ -73,7 +77,7 @@
       - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
       - GEMINI_API_KEY=${GEMINI_API_KEY}
       - OPENAI_API_KEY=${OPENAI_API_KEY:-not-set}
-      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-sk-local-admin}
+      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-sk-changeme-set-in-env}
     command: ["--config", "/app/config.yaml", "--port", "4000"]
     healthcheck:
       test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:4000/health/liveness?key=${LITELLM_MASTER_KEY:-sk-local-admin}')"]
@@ -85,7 +89,7 @@
       - ai-net
 
   # ─── Olla — LLM proxy & load balancer ─────────────────────────────────────
-  # Single unified endpoint for all models — local Arc GPU + cloud via LiteLLM.
+  # Single unified endpoint for all models — local Ollama + cloud via LiteLLM.
   # LAN node config is NOT baked in here — declare extra nodes as OLLAMA_REMOTE_*
   # variables in your .env, then run scripts/generate-olla-config.sh to produce
   # proxy/olla.yaml before bringing up the stack.
@@ -100,8 +104,10 @@
       - ./proxy/olla.yaml:/app/config.yaml:ro   # generated by scripts/generate-olla-config.sh — in .gitignore
       - olla-logs:/app/logs
     depends_on:
-      - ollama-arc
-      - litellm
+      ollama:
+        condition: service_healthy
+      litellm:
+        condition: service_healthy
     healthcheck:
       test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:40114/internal/health"]
       interval: 30s
@@ -116,7 +122,7 @@
   # Uses sqlite-vec for vector storage, FTS5 for keyword search,
   # and watchdog for live vault indexing.
   # Hybrid search: BM25 (FTS5) + vector similarity, fused via RRF.
-  # Embeddings via Olla → ollama-arc (nomic-embed-text).
+  # Embeddings via Olla → ollama (nomic-embed-text).
   # API-only: no web UI. Intended for use as an OpenCode tool.
   retriever:
     build: ./retriever
@@ -135,7 +141,8 @@
       - DB_PATH=/data/retriever.db
       - VAULT_PATH=/vault
     depends_on:
-      - olla
+      olla:
+        condition: service_healthy
     networks:
       - ai-net
@@ -154,7 +161,8 @@
       - LISTEN_HOST=0.0.0.0
       - LISTEN_PORT=40115
     depends_on:
-      - olla
+      olla:
+        condition: service_healthy
     networks:
       - ai-net
@@ -165,4 +173,4 @@ volumes:
 
 # ─── Networks ─────────────────────────────────────────────────────────────────
 networks:
-  ai-net:
\ No newline at end of file
+  ai-net:
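Because olla now gates on service_healthy, a failing Ollama or LiteLLM healthcheck will hold the whole stack at startup, so it is worth knowing how to see why. Standard Docker commands, not repo scripts:

    docker compose ps                                           # health: starting / healthy / unhealthy per service
    docker inspect --format '{{json .State.Health}}' ollama     # last probe output and failure count
    docker exec ollama wget -q --spider http://localhost:11434/ && echo reachable   # same probe the healthcheck runs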
diff --git a/scripts/generate-olla-config.sh b/scripts/generate-olla-config.sh
index 261af15..c0b19ee 100755
--- a/scripts/generate-olla-config.sh
+++ b/scripts/generate-olla-config.sh
@@ -16,7 +16,7 @@
 #   OLLAMA_REMOTE_NAS=http://192.168.1.51:11434
 #
 # Fixed nodes (always present, not configurable):
-#   priority 100 — ollama-arc (local Intel Arc iGPU)
+#   priority 100 — ollama (local Ollama node — Arc, NVIDIA, or CPU depending on overlay)
 #   priority 50  — litellm (cloud gateway: Claude, Gemini)
 #
 # Advanced tuning (optional .env vars):
@@ -82,7 +82,7 @@
 echo "→ Generating olla.yaml..."
 if [[ ${#REMOTE_URLS[@]} -eq 0 ]]; then
-  echo "   (no OLLAMA_REMOTE_* entries found — only local Arc node + LiteLLM)"
+  echo "   (no OLLAMA_REMOTE_* entries found — only local Ollama node + LiteLLM)"
 fi
 
 # ── Write olla.yaml ────────────────────────────────────────────────────────────
@@ -108,9 +108,9 @@
 discovery:
   static:
     endpoints:
-      # ── Local Intel Arc iGPU node (priority 100) ─────────────────
-      - url: "http://ollama-arc:11434"
-        name: "ollama-arc"
+      # ── Local Ollama node (priority 100) ─────────────────
+      - url: "http://ollama:11434"
+        name: "ollama"
         type: "ollama"
         priority: 100
         check_interval: 15s
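End-to-end usage sketch for the renamed node; the OLLAMA_REMOTE_NAS value is the example from the script header, not a real host, and the Arc overlay is just one of the three profiles:

    cp .env.example .env
    echo 'OLLAMA_REMOTE_NAS=http://192.168.1.51:11434' >> .env    # optional extra LAN node
    scripts/generate-olla-config.sh                               # writes proxy/olla.yaml with "ollama" at priority 100
    docker compose -f docker-compose.yml -f docker-compose.arc.yml up -d
    curl -s http://localhost:40114/internal/status/endpoints      # the local "ollama" node and "litellm" should both be listed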