FormicOS/docker-compose.yml at main · Intradyne/FormicOS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# FormicOS v2 — Docker Compose
#
# Cloud-first by default: 3 containers (FormicOS + Qdrant + Docker proxy).
# Set ANTHROPIC_API_KEY in .env and run: docker compose build && docker compose up -d
#
# For local GPU inference (5 containers): set COMPOSE_PROFILES=local-gpu in .env,
# or run: bash scripts/setup-local-gpu.sh
# See .env.example for all tunable knobs.

services:
  # ── Qdrant Vector Store (ADR-013) ─────────────
  qdrant:
    image: qdrant/qdrant:v1.16.2
    container_name: formicos-qdrant
    ports:
      - "6333:6333"
      - "6334:6334"
    volumes:
      - qdrant-data:/qdrant/storage
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/6333' || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s

  # ── Embedding Sidecar — Qwen3-Embedding-0.6B (Wave 13) ──
  # Decoder-only model via llama.cpp with last-token pooling.
  # ~700 MB VRAM (Q8_0) at --n-gpu-layers 99.
  # Set EMBED_GPU_LAYERS=0 to move embedding to CPU, freeing ~700 MB VRAM.
  #
  # Download model:
  #   huggingface-cli download Qwen/Qwen3-Embedding-0.6B-GGUF \
  #     Qwen3-Embedding-0.6B-Q8_0.gguf --local-dir .models
  formicos-embed:
    profiles: [local-gpu]
    image: ${EMBED_IMAGE:-${LLM_IMAGE:-local/llama.cpp:server-cuda-blackwell}}
    container_name: formicos-embed
    ports:
      - "8200:8200"
    volumes:
      - ${LLM_MODEL_DIR:-./.models}:/models:ro
    environment:
      # GPU pinning: embedding can run on a secondary GPU to free VRAM
      # on the primary. Set CUDA_DEVICE_EMBED in .env.
      - CUDA_VISIBLE_DEVICES=${CUDA_DEVICE_EMBED:-${CUDA_DEVICE:-0}}
    command: >
      --model /models/Qwen3-Embedding-0.6B-Q8_0.gguf
      --embedding
      --pooling last
      -ub 8192
      --port 8200
      --host 0.0.0.0
      --n-gpu-layers ${EMBED_GPU_LAYERS:-99}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['${CUDA_DEVICE_EMBED:-${CUDA_DEVICE:-0}}']
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8200/health || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # ── Docker Socket Proxy (Wave 43) ──────────────
  # Restricts Docker API access to the minimum operations needed for
  # sandbox container spawning. The FormicOS backend connects through
  # this proxy instead of the raw socket.
  #
  # SECURITY: This is a MITIGATION, not a complete fix. For stronger
  # isolation, use Sysbox or gVisor (see docker-compose.sysbox.yml).
  docker-proxy:
    image: tecnativa/docker-socket-proxy:latest
    container_name: formicos-docker-proxy
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    environment:
      # Allow only the operations needed for sandbox spawning
      - CONTAINERS=1     # create/start/stop/remove containers
      - POST=1           # allow POST (create/start)
      - IMAGES=0         # no image pull/build through proxy
      - NETWORKS=0       # no network management
      - VOLUMES=0        # no volume management
      - SERVICES=0       # no swarm services
      - NODES=0          # no swarm nodes
      - BUILD=0          # no image builds
      - EXEC=0           # no exec into running containers
      - SWARM=0          # no swarm operations
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:2375/_ping || exit 1"]
      interval: 10s
      timeout: 3s
      retries: 3
      start_period: 5s

  # ── FormicOS Colony OS ──────────────────────────
  formicos:
    build: .
    container_name: formicos-colony
    cap_drop:
      - NET_RAW
      - SYS_CHROOT
      - MKNOD
      - AUDIT_WRITE
    ports:
      - "8080:8080"
    volumes:
      - formicos-data:/data
      # Wave 81: bind real project root (read-write).
      # Set PROJECT_DIR in .env to your project path. Default: current directory.
      - ${PROJECT_DIR:-.}:/project
      # Dev-only: mount benchmark exercises for evaluation runs.
      # Uncomment and set BENCHMARK_DIR in .env to your local path.
      # - ${BENCHMARK_DIR:-/dev/null}:/benchmark:ro
      # Wave 43: No raw Docker socket mount. Use the socket proxy instead.
      # To bypass the proxy (dev only), uncomment the line below and remove
      # DOCKER_HOST, then remove the docker-proxy service dependency.
      # - /var/run/docker.sock:/var/run/docker.sock
    env_file:
      - path: ${FORMICOS_ENV_FILE:-.env}
        required: false
    environment:
      - FORMICOS_DATA_DIR=/data
      - PROJECT_DIR=/project
      - LLM_HOST=${LLM_HOST:-}
      - QDRANT_URL=http://qdrant:6333
      - EMBED_URL=${EMBED_URL:-}
      # Wave 43: Route Docker commands through the socket proxy
      - DOCKER_HOST=tcp://docker-proxy:2375
    depends_on:
      # Profile-gated services (llm, formicos-embed) cannot be hard
      # dependencies — they only start with COMPOSE_PROFILES=local-gpu.
      # Adapters handle missing endpoints gracefully at first call.
      qdrant:
        condition: service_healthy
      docker-proxy:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8080/health')\" || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 3
      start_period: 30s

  # ── OPTIONAL: HTTPS Reverse Proxy — Caddy ──────────
  # Not needed for local Claude Desktop or Claude Code connections.
  # Claude Desktop uses mcp-remote (stdio→HTTP bridge) over localhost.
  # Enable HTTPS only for production/exposed deployments.
  #
  # To enable: uncomment below, or use the override file:
  #   docker compose -f docker-compose.yml -f docker-compose.https.yml up -d
  #
  # First-time setup on host:
  #   mkcert -install
  #   mkdir -p certs
  #   mkcert -cert-file certs/localhost.pem -key-file certs/localhost-key.pem localhost 127.0.0.1 ::1
  #
  # caddy:
  #   image: caddy:2-alpine
  #   container_name: formicos-caddy
  #   ports:
  #     - "8443:8443"
  #   volumes:
  #     - ./Caddyfile:/etc/caddy/Caddyfile:ro
  #     - ./certs:/certs:ro
  #   depends_on:
  #     formicos:
  #       condition: service_healthy
  #   restart: unless-stopped
  #   healthcheck:
  #     test: ["CMD-SHELL", "wget -qO- --no-check-certificate https://localhost:8443/health || exit 1"]
  #     interval: 10s
  #     timeout: 5s
  #     retries: 3
  #     start_period: 5s

  # ── LLM Inference — llama.cpp (GPU) ─────────────
  # Qwen3.5-35B-A3B: MoE with 3.5B active params per token.
  # Alias and chat-template args are configurable so the same service can
  # host alternative local models without code changes.
  #
  # VRAM budget (RTX 5090 32GB, Qwen3.5-35B-A3B Q4_K_M, 65536 ctx):
  #   Model weights:  ~19.5 GB (Q4_K_M, MoE — only 3.5B active per token)
  #   KV cache:       ~5.5 GB (65536 ctx x 2 slots x bf16)
  #   Compute buffers: ~2.4 GB
  #   Embed sidecar:   ~0.7 GB (Qwen3-Embedding-0.6B Q8_0)
  #   Prompt cache:    ~1 GB (--cache-ram, in system RAM not VRAM)
  #   ─────────────────────────
  #   Total GPU:      ~29.1 GB — leaves ~2.9 GB headroom
  #   --fit on auto-sizes KV cache to available VRAM (OOM safety net).
  #
  # Requires Blackwell-native image for full 65536 context.
  # Build with: bash scripts/build_llm_image.sh
  # Fallback: LLM_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda (PTX JIT, ~16k effective).
  #
  # COUPLING: LLM_SLOTS must match the adapter concurrency limit.
  # The adapter reads LLM_SLOTS from the environment automatically.
  llm:
    profiles: [local-gpu]
    image: ${LLM_IMAGE:-local/llama.cpp:server-cuda-blackwell}
    container_name: formicos-llm
    ports:
      - "${LLM_PORT:-8008}:8080"
    volumes:
      - ${LLM_MODEL_DIR:-./.models}:/models:ro
      - ./config:/config:ro
    environment:
      # GPU pinning: CUDA_VISIBLE_DEVICES is the effective control on
      # Docker Desktop / WSL2 (device_ids in deploy is ignored there).
      # Set CUDA_DEVICE in .env to target a different GPU index.
      - CUDA_VISIBLE_DEVICES=${CUDA_DEVICE:-0}
      - GGML_CUDA_GRAPH_OPT=1
    command: >
      --model /models/${LLM_MODEL_FILE:-Qwen3.5-35B-A3B-Q4_K_M.gguf}
      --alias ${LLM_MODEL_ALIAS:-qwen3.5-35b}
      --ctx-size ${LLM_CONTEXT_SIZE:-65536}
      --n-gpu-layers 99
      --flash-attn ${LLM_FLASH_ATTN:-on}
      --fit on
      --kv-unified
      --cache-type-k ${LLM_CACHE_TYPE_K:-q4_0}
      --cache-type-v ${LLM_CACHE_TYPE_V:-q4_0}
      --batch-size ${LLM_BATCH_SIZE:-8192}
      --ubatch-size ${LLM_UBATCH_SIZE:-4096}
      --threads 8
      --threads-batch 16
      --jinja
      ${LLM_CHAT_TEMPLATE_ARGS:---chat-template-file /config/qwen35-chat.jinja}
      --slots
      -np ${LLM_SLOTS:-2}
      -sps ${LLM_SLOT_PROMPT_SIMILARITY:-0.5}
      --cache-ram ${LLM_CACHE_RAM:-1024}
      --host 0.0.0.0
      --port 8080
      ${LLM_SPEC_ARGS:-}
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            # device_ids works on native Linux; ignored on Docker Desktop / WSL2.
            - driver: nvidia
              device_ids: ['${CUDA_DEVICE:-0}']
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 5
      start_period: 120s

  # ── ALTERNATIVE: Ollama (simpler, heavier) ──────
  # Uncomment this and comment out the llm service above if you prefer Ollama.
  # Then change LLM_HOST to http://ollama:11434 and model defaults to ollama/*.
  #
  # VRAM budget (RTX 5090 32GB, llama3.3 Q4_K_M 70B):
  #   Model weights:  ~40 GB (Q4_K_M 70B) — tight fit
  #   KV cache:       reduced by q8_0 quantization
  #   ─────────────────────────
  #   Fits with KV cache quantization enabled.
  #   For better headroom, use Qwen3-30B-A3B (~18.6 GB weights).
  #
  # ollama:
  #   image: ollama/ollama
  #   container_name: formicos-ollama
  #   ports:
  #     - "11434:11434"
  #   volumes:
  #     - ollama-data:/root/.ollama
  #   environment:
  #     - OLLAMA_FLASH_ATTENTION=1
  #     - OLLAMA_GPU_OVERHEAD=0
  #     - OLLAMA_NUM_PARALLEL=4
  #     - OLLAMA_MAX_LOADED_MODELS=2
  #     - OLLAMA_KV_CACHE_TYPE=q8_0
  #     - CUDA_VISIBLE_DEVICES=0
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             count: all
  #             capabilities: [gpu]
  #   restart: unless-stopped
  #   healthcheck:
  #     test: ["CMD-SHELL", "curl -sf http://localhost:11434/api/tags || exit 1"]
  #     interval: 10s
  #     timeout: 10s
  #     retries: 5
  #     start_period: 60s

volumes:
  formicos-data:
  qdrant-data:
  # ollama-data: